//-------------------------------------------------------------------------------------
// DirectXMath.h -- SIMD C++ Math library
//
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
//
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------

#pragma once

#ifndef __cplusplus
#error DirectX Math requires C++
#endif

// Version identifier of this header, available to client code for feature checks.
#define DIRECTX_MATH_VERSION 318

// Refuse to build with MSVC toolsets whose _MSC_VER is below 1910.
#if defined(_MSC_VER) && (_MSC_VER < 1910)
#error DirectX Math requires Visual C++ 2017 or later.
#endif

// Turn on _XM_VECTORCALL_ automatically for MSVC builds that are not ARM/ARM64
// variants, not managed (/clr) code, have _M_IX86_FP either undefined or > 1,
// use intrinsics, and have not already set _XM_VECTORCALL_ themselves.
#if defined(_MSC_VER) && !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) && (!_MANAGED) && (!_M_CEE) && (!defined(_M_IX86_FP) || (_M_IX86_FP > 1)) && !defined(_XM_NO_INTRINSICS_) && !defined(_XM_VECTORCALL_)
#define _XM_VECTORCALL_ 1
#endif

// XM_CALLCONV is the calling-convention decoration placed on library functions:
//   - __vectorcall when _XM_VECTORCALL_ is non-zero,
//   - nothing at all for GNU-compatible compilers,
//   - __fastcall otherwise.
#if _XM_VECTORCALL_
#define XM_CALLCONV __vectorcall
#elif defined(__GNUC__)
#define XM_CALLCONV
#else
#define XM_CALLCONV __fastcall
#endif

// XM_DEPRECATED marks a declaration as deprecated. Client code may pre-define it
// to override the default; otherwise the GNU attribute or the MSVC __declspec
// form (with a fixed message) is used.
#ifndef XM_DEPRECATED
#ifdef __GNUC__
#define XM_DEPRECATED __attribute__((deprecated))
#else
#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
#endif
#endif

// Instruction-set selection. The checks below run top-down from the widest
// feature set to the narrowest, so that enabling one level also enables every
// level it depends on. Defining _XM_NO_INTRINSICS_ suppresses all automatic
// detection from compiler-provided macros.

// AVX2: enabled when the compiler reports __AVX2__.
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX2_INTRINSICS_
#endif

// AVX2 implies FMA3.
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_FMA3_INTRINSICS_
#endif

// AVX2 implies F16C.
#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif

// F16C may also be enabled on its own when the compiler reports __F16C__.
#if !defined(_XM_F16C_INTRINSICS_) && defined(__F16C__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif

// FMA3 implies AVX.
#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif

// F16C implies AVX.
#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif

// AVX: enabled when the compiler reports __AVX__.
#if !defined(_XM_AVX_INTRINSICS_) && defined(__AVX__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif

// AVX implies SSE4.
#if defined(_XM_AVX_INTRINSICS_) && !defined(_XM_SSE4_INTRINSICS_)
#define _XM_SSE4_INTRINSICS_
#endif

// SSE4 implies SSE3.
#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_SSE3_INTRINSICS_)
#define _XM_SSE3_INTRINSICS_
#endif

// SSE3 implies the baseline SSE path.
#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_)
#define _XM_SSE_INTRINSICS_
#endif

// If nothing has been selected yet (no NEON, no SSE, no explicit opt-out), pick
// the baseline path from the target-architecture macros: SSE for x86/x64
// (excluding the hybrid x86-on-ARM64 and ARM64EC targets), NEON for ARM
// targets, and a hard error for anything else.
#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if (defined(_M_IX86) || defined(_M_X64) || __i386__ || __x86_64__) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
#define _XM_SSE_INTRINSICS_
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
#define _XM_ARM_NEON_INTRINSICS_
#elif !defined(_XM_NO_INTRINSICS_)
#error DirectX Math does not support this target
#endif
#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_

// SVML: enabled for SSE builds on MSVC with _MSC_VER >= 1920 (not clang), unless
// the client defines _XM_DISABLE_INTEL_SVML_ or has already set the macro.
#if defined(_XM_SSE_INTRINSICS_) && defined(_MSC_VER) && (_MSC_VER >= 1920) && !defined(__clang__) && !defined(_XM_SVML_INTRINSICS_) && !defined(_XM_DISABLE_INTEL_SVML_)
#define _XM_SVML_INTRINSICS_
#endif

// On clang and GCC, when intrinsics are in use, the XMVECTOR operator overloads
// are switched off by defining _XM_NO_XMVECTOR_OVERLOADS_.
#if !defined(_XM_NO_XMVECTOR_OVERLOADS_) && (defined(__clang__) || defined(__GNUC__)) && !defined(_XM_NO_INTRINSICS_)
#define _XM_NO_XMVECTOR_OVERLOADS_
#endif

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4514 4820)
// C4514/4820: Off by default noise
#endif
#include <math.h>
#include <float.h>
#ifdef _MSC_VER
#pragma warning(pop)
#endif

#ifndef _XM_NO_INTRINSICS_

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4987)
// C4987: Off by default noise
#include <intrin.h>
#pragma warning(pop)
#endif

#if (defined(__clang__) || defined(__GNUC__)) && (__x86_64__ || __i386__)
#include <cpuid.h>
#endif

#ifdef _XM_SSE_INTRINSICS_
#include <xmmintrin.h>
#include <emmintrin.h>

#ifdef _XM_SSE3_INTRINSICS_
#include <pmmintrin.h>
#endif

#ifdef _XM_SSE4_INTRINSICS_
#include <smmintrin.h>
#endif

#ifdef _XM_AVX_INTRINSICS_
#include <immintrin.h>
#endif

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC))
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
#endif // !_XM_NO_INTRINSICS_

#include <assert.h>

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4005 4668)
// C4005/4668: Old header issue
#endif
#include <stdint.h>
#ifdef _MSC_VER
#pragma warning(pop)
#endif

// Alignment helpers.
//   XM_ALIGNED_DATA(x)   - prefix that aligns a data declaration to x bytes.
//   XM_ALIGNED_STRUCT(x) - replaces the `struct` keyword for a type aligned to
//                          x bytes; the type name follows the macro.
// The standard alignas form is used when __cplusplus reports C++17 or later;
// otherwise the GNU attribute or the MSVC __declspec(align) spelling is used.
#if __cplusplus >= 201703L
#define XM_ALIGNED_DATA(x) alignas(x)
#define XM_ALIGNED_STRUCT(x) struct alignas(x)
#elif defined(__GNUC__)
#define XM_ALIGNED_DATA(x) __attribute__((aligned(x)))
#define XM_ALIGNED_STRUCT(x) struct __attribute__((aligned(x)))
#else
#define XM_ALIGNED_DATA(x) __declspec(align(x))
#define XM_ALIGNED_STRUCT(x) __declspec(align(x)) struct
#endif

#if (__cplusplus >= 202002L)
#include <compare>
#endif

/****************************************************************************
 *
 * Conditional intrinsics
 *
 ****************************************************************************/

// Helper macros for the SSE code path. Each one hides a choice between a
// preferred intrinsic and a fallback selected by a configuration macro.
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

// Streaming stores. With _XM_NO_MOVNT_ defined, the streaming store macros fall
// back to the regular _mm_store_ps / _mm256_store_ps and XM_SFENCE() expands to
// nothing; otherwise they map to _mm_stream_ps / _mm256_stream_ps and
// _mm_sfence().
#if defined(_XM_NO_MOVNT_)
#define XM_STREAM_PS(p, a) _mm_store_ps((p), (a))
#define XM256_STREAM_PS(p, a) _mm256_store_ps((p), (a))
#define XM_SFENCE()
#else
#define XM_STREAM_PS(p, a) _mm_stream_ps((p), (a))
#define XM256_STREAM_PS(p, a) _mm256_stream_ps((p), (a))
#define XM_SFENCE() _mm_sfence()
#endif

// Multiply-add helpers:
//   XM_FMADD_PS(a, b, c)  -> (a * b) + c
//   XM_FNMADD_PS(a, b, c) -> c - (a * b)
// With FMA3 available these are the fused intrinsics; otherwise they are built
// from a separate multiply followed by an add or subtract.
#if defined(_XM_FMA3_INTRINSICS_)
#define XM_FMADD_PS(a, b, c) _mm_fmadd_ps((a), (b), (c))
#define XM_FNMADD_PS(a, b, c) _mm_fnmadd_ps((a), (b), (c))
#else
#define XM_FMADD_PS(a, b, c) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
#define XM_FNMADD_PS(a, b, c) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
#endif

// Single-source permute of v by the immediate c. _mm_permute_ps is used only
// when AVX is enabled and the client defines _XM_FAVOR_INTEL_; otherwise
// _mm_shuffle_ps is used with v supplied as both operands.
#if defined(_XM_AVX_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
#define XM_PERMUTE_PS(v, c) _mm_permute_ps((v), c)
#else
#define XM_PERMUTE_PS(v, c) _mm_shuffle_ps((v), (v), c)
#endif

// 16-bit load into an integer vector. For GCC (not clang) older than version 11
// the value is read through an unsigned short pointer and moved into the vector
// with _mm_cvtsi32_si128; all other compilers use _mm_loadu_si16 directly.
// NOTE(review): presumably older GCC lacks _mm_loadu_si16 - confirm.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 11)
#define XM_LOADU_SI16(p) _mm_cvtsi32_si128(*reinterpret_cast<unsigned short const *>(p))
#else
#define XM_LOADU_SI16(p) _mm_loadu_si16(p)
#endif

#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_

// Helper macro for the ARM NEON code path.
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

// XM_PREFETCH(a) issues a prefetch for address a: __builtin_prefetch on clang
// and GCC, __prefetch on MSVC, and nothing on any other compiler.
#if defined(__clang__) || defined(__GNUC__)
#define XM_PREFETCH(a) __builtin_prefetch(a)
#elif defined(_MSC_VER)
#define XM_PREFETCH(a) __prefetch(a)
#else
#define XM_PREFETCH(a)
#endif

#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_

namespace DirectX
{

    /****************************************************************************
     *
     * Constant definitions
     *
     ****************************************************************************/

// When the header guarded by __XNAMATH_H__ has already been included and left
// XM_PI defined as a macro, remove that header's macro versions of the names
// below so they can be declared as typed constants in this namespace.
#if defined(__XNAMATH_H__) && defined(XM_PI)
#undef XM_PI
#undef XM_2PI
#undef XM_1DIVPI
#undef XM_1DIV2PI
#undef XM_PIDIV2
#undef XM_PIDIV4
#undef XM_SELECT_0
#undef XM_SELECT_1
#undef XM_PERMUTE_0X
#undef XM_PERMUTE_0Y
#undef XM_PERMUTE_0Z
#undef XM_PERMUTE_0W
#undef XM_PERMUTE_1X
#undef XM_PERMUTE_1Y
#undef XM_PERMUTE_1Z
#undef XM_PERMUTE_1W
#undef XM_CRMASK_CR6
#undef XM_CRMASK_CR6TRUE
#undef XM_CRMASK_CR6FALSE
#undef XM_CRMASK_CR6BOUNDS
#undef XM_CACHE_LINE_SIZE
#endif

    // Single-precision multiples and fractions of pi.
    constexpr float XM_PI = 3.141592654f;
    constexpr float XM_2PI = 6.283185307f;
    constexpr float XM_1DIVPI = 0.318309886f;
    constexpr float XM_1DIV2PI = 0.159154943f;
    constexpr float XM_PIDIV2 = 1.570796327f;
    constexpr float XM_PIDIV4 = 0.785398163f;

    // Per-lane select masks: all bits clear / all bits set.
    constexpr uint32_t XM_SELECT_0 = 0x00000000;
    constexpr uint32_t XM_SELECT_1 = 0xFFFFFFFF;

    // Permute indices: 0..3 name X, Y, Z, W of the first source vector and
    // 4..7 name X, Y, Z, W of the second.
    constexpr uint32_t XM_PERMUTE_0X = 0;
    constexpr uint32_t XM_PERMUTE_0Y = 1;
    constexpr uint32_t XM_PERMUTE_0Z = 2;
    constexpr uint32_t XM_PERMUTE_0W = 3;
    constexpr uint32_t XM_PERMUTE_1X = 4;
    constexpr uint32_t XM_PERMUTE_1Y = 5;
    constexpr uint32_t XM_PERMUTE_1Z = 6;
    constexpr uint32_t XM_PERMUTE_1W = 7;

    // Swizzle indices for the four components of a single vector.
    constexpr uint32_t XM_SWIZZLE_X = 0;
    constexpr uint32_t XM_SWIZZLE_Y = 1;
    constexpr uint32_t XM_SWIZZLE_Z = 2;
    constexpr uint32_t XM_SWIZZLE_W = 3;

    // Bit masks applied to the comparison-result value tested by the
    // XMComparison* helpers below.
    constexpr uint32_t XM_CRMASK_CR6 = 0x000000F0;
    constexpr uint32_t XM_CRMASK_CR6TRUE = 0x00000080;
    constexpr uint32_t XM_CRMASK_CR6FALSE = 0x00000020;
    constexpr uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE;

    // Cache line size in bytes assumed by the library: 128 on ARM targets,
    // 64 everywhere else.
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
    constexpr size_t XM_CACHE_LINE_SIZE = 128;
#else
    constexpr size_t XM_CACHE_LINE_SIZE = 64;
#endif

    /****************************************************************************
     *
     * Macros
     *
     ****************************************************************************/

    // Drop the macro versions of the comparison helpers left behind by the
    // header guarded by __XNAMATH_H__, so the functions below can take the names.
#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue)
#undef XMComparisonAllTrue
#undef XMComparisonAnyTrue
#undef XMComparisonAllFalse
#undef XMComparisonAnyFalse
#undef XMComparisonMixed
#undef XMComparisonAllInBounds
#undef XMComparisonAnyOutOfBounds
#endif

    // Unit conversion

    // Degrees -> radians: scales by pi / 180.
    constexpr float XMConvertToRadians(float fDegrees) noexcept
    {
        return (XM_PI / 180.0f) * fDegrees;
    }

    // Radians -> degrees: scales by 180 / pi.
    constexpr float XMConvertToDegrees(float fRadians) noexcept
    {
        return (180.0f / XM_PI) * fRadians;
    }

    // Condition register evaluation following a recording (R) comparison.
    // Every helper tests the masked bits of CR; "mask fully set in CR" is
    // written as "no mask bit is missing from CR".

    // True when the CR6TRUE bit is set in CR.
    constexpr bool XMComparisonAllTrue(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6TRUE & ~CR) == 0u;
    }

    // True when the CR6FALSE bit is not set in CR.
    constexpr bool XMComparisonAnyTrue(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6FALSE & ~CR) != 0u;
    }

    // True when the CR6FALSE bit is set in CR.
    constexpr bool XMComparisonAllFalse(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6FALSE & ~CR) == 0u;
    }

    // True when the CR6TRUE bit is not set in CR.
    constexpr bool XMComparisonAnyFalse(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6TRUE & ~CR) != 0u;
    }

    // True when none of the bits covered by XM_CRMASK_CR6 are set in CR.
    constexpr bool XMComparisonMixed(uint32_t CR) noexcept
    {
        return 0u == (CR & XM_CRMASK_CR6);
    }

    // True when the CR6BOUNDS bit is set in CR.
    constexpr bool XMComparisonAllInBounds(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6BOUNDS & ~CR) == 0u;
    }

    // True when the CR6BOUNDS bit is not set in CR.
    constexpr bool XMComparisonAnyOutOfBounds(uint32_t CR) noexcept
    {
        return (XM_CRMASK_CR6BOUNDS & ~CR) != 0u;
    }

    /****************************************************************************
     *
     * Data types
     *
     ****************************************************************************/

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4068 4201 4365 4324 4820)
    // C4068: ignore unknown pragmas
    // C4201: nonstandard extension used : nameless struct/union
    // C4365: Off by default noise
    // C4324/4820: padding warnings
#endif

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
#endif

//------------------------------------------------------------------------------
// Portable stand-in for a hardware vector, compiled only when intrinsics are
// disabled. The same four 32-bit lanes can be accessed either as floats or as
// unsigned integers through the anonymous union.
#if defined(_XM_NO_INTRINSICS_)
    struct __vector4
    {
        union
        {
            float vector4_f32[4];    // lanes viewed as floating point
            uint32_t vector4_u32[4]; // the same lanes viewed as raw 32-bit integers
        };
    };
#endif // _XM_NO_INTRINSICS_

    //------------------------------------------------------------------------------
    // Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
    // boundary and mapped to hardware vector registers
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    using XMVECTOR = __m128;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    using XMVECTOR = float32x4_t;
#else
    using XMVECTOR = __vector4;
#endif

    // Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, ARM64, and vector call; by reference otherwise
#if (defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64) || _XM_VECTORCALL_ || __i386__ || __arm__ || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
    typedef const XMVECTOR FXMVECTOR;
#else
    typedef const XMVECTOR &FXMVECTOR;
#endif

    // Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM, ARM64, and vector call; by reference otherwise
#if (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __arm__ || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
    typedef const XMVECTOR GXMVECTOR;
#else
    typedef const XMVECTOR &GXMVECTOR;
#endif

    // Fix-up for (5th & 6th) XMVECTOR parameter to pass in-register for ARM64 and vector call; by reference otherwise
#if (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
    typedef const XMVECTOR HXMVECTOR;
#else
    typedef const XMVECTOR &HXMVECTOR;
#endif

    // Fix-up for (7th+) XMVECTOR parameters to pass by reference
    typedef const XMVECTOR &CXMVECTOR;

    //------------------------------------------------------------------------------
    // Conversion types for constants
    // XMVECTORF32: 16-byte aligned helper that lets a vector constant be written
    // as four floats (`f`) and read back as an XMVECTOR (`v`); both members
    // share storage through the anonymous union.
    XM_ALIGNED_STRUCT(16)
    XMVECTORF32
    {
        union
        {
            float f[4];
            XMVECTOR v;
        };

        // Implicit conversion to the vector view.
        inline operator XMVECTOR() const noexcept { return v; }
        // Implicit conversion to a pointer to the first float lane.
        inline operator const float *() const noexcept { return f; }
        // Extra conversions exist only on intrinsics builds: SSE builds get the
        // integer and double register types via the cast intrinsics; NEON builds
        // (GCC, or when _ARM64_DISTINCT_NEON_TYPES is defined) get the signed and
        // unsigned 32-bit lane types via vreinterpretq.
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
    };

    // XMVECTORI32: 16-byte aligned helper that lets a vector constant be written
    // as four signed 32-bit integers (`i`) and read back as an XMVECTOR (`v`);
    // both members share storage through the anonymous union.
    XM_ALIGNED_STRUCT(16)
    XMVECTORI32
    {
        union
        {
            int32_t i[4];
            XMVECTOR v;
        };

        // Implicit conversion to the vector view.
        inline operator XMVECTOR() const noexcept { return v; }
        // Extra conversions exist only on intrinsics builds: SSE builds get the
        // integer and double register types via the cast intrinsics; NEON builds
        // (GCC, or when _ARM64_DISTINCT_NEON_TYPES is defined) get the signed and
        // unsigned 32-bit lane types via vreinterpretq.
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
    };

    // XMVECTORU8: 16-byte aligned helper that lets a vector constant be written
    // as sixteen unsigned bytes (`u`) and read back as an XMVECTOR (`v`); both
    // members share storage through the anonymous union.
    XM_ALIGNED_STRUCT(16)
    XMVECTORU8
    {
        union
        {
            uint8_t u[16];
            XMVECTOR v;
        };

        // Implicit conversion to the vector view.
        inline operator XMVECTOR() const noexcept { return v; }
        // Extra conversions exist only on intrinsics builds: SSE builds get the
        // integer and double register types via the cast intrinsics; NEON builds
        // (GCC, or when _ARM64_DISTINCT_NEON_TYPES is defined) get the signed and
        // unsigned 32-bit lane types via vreinterpretq.
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
    };

    // XMVECTORU32: 16-byte aligned helper that lets a vector constant be written
    // as four unsigned 32-bit integers (`u`) and read back as an XMVECTOR (`v`);
    // both members share storage through the anonymous union.
    XM_ALIGNED_STRUCT(16)
    XMVECTORU32
    {
        union
        {
            uint32_t u[4];
            XMVECTOR v;
        };

        // Implicit conversion to the vector view.
        inline operator XMVECTOR() const noexcept { return v; }
        // Extra conversions exist only on intrinsics builds: SSE builds get the
        // integer and double register types via the cast intrinsics; NEON builds
        // (GCC, or when _ARM64_DISTINCT_NEON_TYPES is defined) get the signed and
        // unsigned 32-bit lane types via vreinterpretq.
#ifdef _XM_NO_INTRINSICS_
#elif defined(_XM_SSE_INTRINSICS_)
        inline operator __m128i() const noexcept { return _mm_castps_si128(v); }
        inline operator __m128d() const noexcept { return _mm_castps_pd(v); }
#elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(__GNUC__) || defined(_ARM64_DISTINCT_NEON_TYPES))
        inline operator int32x4_t() const noexcept { return vreinterpretq_s32_f32(v); }
        inline operator uint32x4_t() const noexcept { return vreinterpretq_u32_f32(v); }
#endif
    };

    //------------------------------------------------------------------------------
    // Vector operators

// Free operator overloads for XMVECTOR. They are declared only when
// _XM_NO_XMVECTOR_OVERLOADS_ is not defined; the definitions are not part of
// this section of the header.
#ifndef _XM_NO_XMVECTOR_OVERLOADS_
    // Unary plus and minus.
    XMVECTOR XM_CALLCONV operator+(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV operator-(FXMVECTOR V) noexcept;

    // Compound assignment with a vector right-hand side; each returns a
    // reference to the left operand.
    XMVECTOR &XM_CALLCONV operator+=(XMVECTOR &V1, FXMVECTOR V2) noexcept;
    XMVECTOR &XM_CALLCONV operator-=(XMVECTOR &V1, FXMVECTOR V2) noexcept;
    XMVECTOR &XM_CALLCONV operator*=(XMVECTOR &V1, FXMVECTOR V2) noexcept;
    XMVECTOR &XM_CALLCONV operator/=(XMVECTOR &V1, FXMVECTOR V2) noexcept;

    // Compound assignment with a scalar right-hand side. These two are the only
    // overloads declared without XM_CALLCONV.
    XMVECTOR &operator*=(XMVECTOR &V, float S) noexcept;
    XMVECTOR &operator/=(XMVECTOR &V, float S) noexcept;

    // Binary arithmetic: vector with vector, and vector with scalar (the scalar
    // may appear on either side of `*`, but only on the right of `/`).
    XMVECTOR XM_CALLCONV operator+(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV operator-(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV operator*(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV operator/(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV operator*(FXMVECTOR V, float S) noexcept;
    XMVECTOR XM_CALLCONV operator*(float S, FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV operator/(FXMVECTOR V, float S) noexcept;
#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */

    //------------------------------------------------------------------------------
    // Matrix type: Sixteen 32 bit floating point components aligned on a
    // 16 byte boundary and mapped to four hardware vector registers

    // Forward declaration so the parameter aliases can be named before the
    // full definition of the matrix type.
    struct XMMATRIX;

    // FXMMATRIX - 1st XMMATRIX parameter: by value for ARM64 and vector call,
    // by reference otherwise.
#if (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || _XM_VECTORCALL_ || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
    using FXMMATRIX = const XMMATRIX;
#else
    using FXMMATRIX = const XMMATRIX &;
#endif

    // CXMMATRIX - 2nd and later XMMATRIX parameters: always by reference.
    using CXMMATRIX = const XMMATRIX &;

// XMMATRIX: four XMVECTOR rows. On intrinsics builds the struct is declared
// with 16-byte alignment and exposes only `r`; on no-intrinsics builds it is a
// plain struct whose rows can also be reached as named floats (_11.._44) or as
// the 2D array `m` through an anonymous union.
#ifdef _XM_NO_INTRINSICS_
    struct XMMATRIX
#else
    XM_ALIGNED_STRUCT(16)
    XMMATRIX
#endif
    {
#ifdef _XM_NO_INTRINSICS_
        union
        {
            XMVECTOR r[4];
            struct
            {
                float _11, _12, _13, _14;
                float _21, _22, _23, _24;
                float _31, _32, _33, _34;
                float _41, _42, _43, _44;
            };
            float m[4][4];
        };
#else
        XMVECTOR r[4];
#endif

        // Defaulted default constructor: performs no initialization of the rows.
        XMMATRIX() = default;

        XMMATRIX(const XMMATRIX &) = default;

        // MSVC builds with _MSC_FULL_VER below 191426431 get a hand-written
        // row-by-row copy assignment and no explicitly declared move operations;
        // every other compiler gets defaulted copy assignment and move operations.
        // NOTE(review): presumably a workaround for an issue in those older
        // toolsets - confirm before removing.
#if defined(_MSC_VER) && (_MSC_FULL_VER < 191426431)
        XMMATRIX &operator=(const XMMATRIX &M) noexcept
        {
            r[0] = M.r[0];
            r[1] = M.r[1];
            r[2] = M.r[2];
            r[3] = M.r[3];
            return *this;
        }
#else
        XMMATRIX &operator=(const XMMATRIX &) = default;

        XMMATRIX(XMMATRIX &&) = default;
        XMMATRIX &operator=(XMMATRIX &&) = default;
#endif

        // Builds the matrix from four row vectors. Note that the fourth row is
        // taken as CXMVECTOR, i.e. always by reference.
        constexpr XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, CXMVECTOR R3) noexcept : r{R0, R1, R2, R3} {}
        // Builds the matrix from sixteen scalars named mRC (row R, column C).
        // Defined outside this section of the header.
        XMMATRIX(float m00, float m01, float m02, float m03,
                 float m10, float m11, float m12, float m13,
                 float m20, float m21, float m22, float m23,
                 float m30, float m31, float m32, float m33) noexcept;
        // Builds the matrix from a float array. Defined outside this section;
        // presumably reads 16 consecutive floats - confirm against the definition.
        explicit XMMATRIX(const float *pArray) noexcept;

        // Element access by (row, column) is available only on no-intrinsics
        // builds, where the `m` array view exists. Indices are not range-checked.
#ifdef _XM_NO_INTRINSICS_
        float operator()(size_t Row, size_t Column) const noexcept { return m[Row][Column]; }
        float &operator()(size_t Row, size_t Column) noexcept { return m[Row][Column]; }
#endif

        // Unary plus returns a copy of this matrix; unary minus is defined elsewhere.
        XMMATRIX operator+() const noexcept { return *this; }
        XMMATRIX operator-() const noexcept;

        // Compound assignment with a matrix or scalar right-hand side
        // (declarations only; defined outside this section).
        XMMATRIX &XM_CALLCONV operator+=(FXMMATRIX M) noexcept;
        XMMATRIX &XM_CALLCONV operator-=(FXMMATRIX M) noexcept;
        XMMATRIX &XM_CALLCONV operator*=(FXMMATRIX M) noexcept;
        XMMATRIX &operator*=(float S) noexcept;
        XMMATRIX &operator/=(float S) noexcept;

        // Binary arithmetic with a matrix or scalar right-hand side
        // (declarations only; defined outside this section).
        XMMATRIX XM_CALLCONV operator+(FXMMATRIX M) const noexcept;
        XMMATRIX XM_CALLCONV operator-(FXMMATRIX M) const noexcept;
        XMMATRIX XM_CALLCONV operator*(FXMMATRIX M) const noexcept;
        XMMATRIX operator*(float S) const noexcept;
        XMMATRIX operator/(float S) const noexcept;

        // Scalar-on-the-left multiplication, declared as a friend free function.
        friend XMMATRIX XM_CALLCONV operator*(float S, FXMMATRIX M) noexcept;
    };

    //------------------------------------------------------------------------------
    // 2D Vector; 32 bit floating point components
    struct XMFLOAT2
    {
        float x;
        float y;

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT2() = default;

        // Copy and move operations are all compiler-generated.
        XMFLOAT2(const XMFLOAT2 &) = default;
        XMFLOAT2(XMFLOAT2 &&) = default;
        XMFLOAT2 &operator=(const XMFLOAT2 &) = default;
        XMFLOAT2 &operator=(XMFLOAT2 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMFLOAT2(float _x, float _y) noexcept
            : x{_x},
              y{_y}
        {
        }

        // Reads pArray[0] and pArray[1]; the pointer is not validated.
        explicit XMFLOAT2(const float *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMFLOAT2 &) const = default;
        auto operator<=>(const XMFLOAT2 &) const = default;
#endif
    };

    // 2D Vector; 32 bit floating point components aligned on a 16 byte boundary
    // XMFLOAT2A adds 16-byte alignment to XMFLOAT2 and nothing else: it declares
    // no members of its own and inherits every XMFLOAT2 constructor.
    XM_ALIGNED_STRUCT(16)
    XMFLOAT2A : public XMFLOAT2
    {
        using XMFLOAT2::XMFLOAT2;
    };

    //------------------------------------------------------------------------------
    // 2D Vector; 32 bit signed integer components
    struct XMINT2
    {
        int32_t x;
        int32_t y;

        // Defaulted default constructor: performs no member initialization.
        XMINT2() = default;

        // Copy and move operations are all compiler-generated.
        XMINT2(const XMINT2 &) = default;
        XMINT2(XMINT2 &&) = default;
        XMINT2 &operator=(const XMINT2 &) = default;
        XMINT2 &operator=(XMINT2 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMINT2(int32_t _x, int32_t _y) noexcept
            : x{_x},
              y{_y}
        {
        }

        // Reads pArray[0] and pArray[1]; the pointer is not validated.
        explicit XMINT2(const int32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMINT2 &) const = default;
        auto operator<=>(const XMINT2 &) const = default;
#endif
    };

    // 2D Vector; 32 bit unsigned integer components
    struct XMUINT2
    {
        uint32_t x;
        uint32_t y;

        // Defaulted default constructor: performs no member initialization.
        XMUINT2() = default;

        // Copy and move operations are all compiler-generated.
        XMUINT2(const XMUINT2 &) = default;
        XMUINT2(XMUINT2 &&) = default;
        XMUINT2 &operator=(const XMUINT2 &) = default;
        XMUINT2 &operator=(XMUINT2 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMUINT2(uint32_t _x, uint32_t _y) noexcept
            : x{_x},
              y{_y}
        {
        }

        // Reads pArray[0] and pArray[1]; the pointer is not validated.
        explicit XMUINT2(const uint32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMUINT2 &) const = default;
        auto operator<=>(const XMUINT2 &) const = default;
#endif
    };

    //------------------------------------------------------------------------------
    // 3D Vector; 32 bit floating point components
    struct XMFLOAT3
    {
        float x;
        float y;
        float z;

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT3() = default;

        XMFLOAT3(const XMFLOAT3 &) = default;
        XMFLOAT3 &operator=(const XMFLOAT3 &) = default;

        XMFLOAT3(XMFLOAT3 &&) = default;
        XMFLOAT3 &operator=(XMFLOAT3 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMFLOAT3(float _x, float _y, float _z) noexcept : x(_x), y(_y), z(_z) {}
        // Reads pArray[0] through pArray[2]; the pointer is not validated.
        explicit XMFLOAT3(const float *pArray) noexcept : x(pArray[0]), y(pArray[1]), z(pArray[2]) {}

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way
        // comparison, matching the other vector structs in this header
        // (XMFLOAT2, XMFLOAT4, XMINT3, XMUINT3, ...).
        bool operator==(const XMFLOAT3 &) const = default;
        auto operator<=>(const XMFLOAT3 &) const = default;
#endif
    };

    // 3D Vector; 32 bit floating point components aligned on a 16 byte boundary
    // XMFLOAT3A adds 16-byte alignment to XMFLOAT3 and nothing else: it declares
    // no members of its own and inherits every XMFLOAT3 constructor.
    XM_ALIGNED_STRUCT(16)
    XMFLOAT3A : public XMFLOAT3
    {
        using XMFLOAT3::XMFLOAT3;
    };

    //------------------------------------------------------------------------------
    // 3D Vector; 32 bit signed integer components
    struct XMINT3
    {
        int32_t x;
        int32_t y;
        int32_t z;

        // Defaulted default constructor: performs no member initialization.
        XMINT3() = default;

        // Copy and move operations are all compiler-generated.
        XMINT3(const XMINT3 &) = default;
        XMINT3(XMINT3 &&) = default;
        XMINT3 &operator=(const XMINT3 &) = default;
        XMINT3 &operator=(XMINT3 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMINT3(int32_t _x, int32_t _y, int32_t _z) noexcept
            : x{_x},
              y{_y},
              z{_z}
        {
        }

        // Reads pArray[0] through pArray[2]; the pointer is not validated.
        explicit XMINT3(const int32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]},
              z{pArray[2]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMINT3 &) const = default;
        auto operator<=>(const XMINT3 &) const = default;
#endif
    };

    // 3D Vector; 32 bit unsigned integer components
    struct XMUINT3
    {
        uint32_t x;
        uint32_t y;
        uint32_t z;

        // Defaulted default constructor: performs no member initialization.
        XMUINT3() = default;

        // Copy and move operations are all compiler-generated.
        XMUINT3(const XMUINT3 &) = default;
        XMUINT3(XMUINT3 &&) = default;
        XMUINT3 &operator=(const XMUINT3 &) = default;
        XMUINT3 &operator=(XMUINT3 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) noexcept
            : x{_x},
              y{_y},
              z{_z}
        {
        }

        // Reads pArray[0] through pArray[2]; the pointer is not validated.
        explicit XMUINT3(const uint32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]},
              z{pArray[2]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMUINT3 &) const = default;
        auto operator<=>(const XMUINT3 &) const = default;
#endif
    };

    //------------------------------------------------------------------------------
    // 4D Vector; 32 bit floating point components
    struct XMFLOAT4
    {
        float x;
        float y;
        float z;
        float w;

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT4() = default;

        // Copy and move operations are all compiler-generated.
        XMFLOAT4(const XMFLOAT4 &) = default;
        XMFLOAT4(XMFLOAT4 &&) = default;
        XMFLOAT4 &operator=(const XMFLOAT4 &) = default;
        XMFLOAT4 &operator=(XMFLOAT4 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMFLOAT4(float _x, float _y, float _z, float _w) noexcept
            : x{_x},
              y{_y},
              z{_z},
              w{_w}
        {
        }

        // Reads pArray[0] through pArray[3]; the pointer is not validated.
        explicit XMFLOAT4(const float *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]},
              z{pArray[2]},
              w{pArray[3]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMFLOAT4 &) const = default;
        auto operator<=>(const XMFLOAT4 &) const = default;
#endif
    };

    // 4D Vector; 32 bit floating point components aligned on a 16 byte boundary
    // XMFLOAT4A adds 16-byte alignment to XMFLOAT4 and nothing else: it declares
    // no members of its own and inherits every XMFLOAT4 constructor.
    XM_ALIGNED_STRUCT(16)
    XMFLOAT4A : public XMFLOAT4
    {
        using XMFLOAT4::XMFLOAT4;
    };

    //------------------------------------------------------------------------------
    // 4D Vector; 32 bit signed integer components
    struct XMINT4
    {
        int32_t x;
        int32_t y;
        int32_t z;
        int32_t w;

        // Defaulted default constructor: performs no member initialization.
        XMINT4() = default;

        // Copy and move operations are all compiler-generated.
        XMINT4(const XMINT4 &) = default;
        XMINT4(XMINT4 &&) = default;
        XMINT4 &operator=(const XMINT4 &) = default;
        XMINT4 &operator=(XMINT4 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) noexcept
            : x{_x},
              y{_y},
              z{_z},
              w{_w}
        {
        }

        // Reads pArray[0] through pArray[3]; the pointer is not validated.
        explicit XMINT4(const int32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]},
              z{pArray[2]},
              w{pArray[3]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMINT4 &) const = default;
        auto operator<=>(const XMINT4 &) const = default;
#endif
    };

    // 4D Vector; 32 bit unsigned integer components
    struct XMUINT4
    {
        uint32_t x;
        uint32_t y;
        uint32_t z;
        uint32_t w;

        // Defaulted default constructor: performs no member initialization.
        XMUINT4() = default;

        // Copy and move operations are all compiler-generated.
        XMUINT4(const XMUINT4 &) = default;
        XMUINT4(XMUINT4 &&) = default;
        XMUINT4 &operator=(const XMUINT4 &) = default;
        XMUINT4 &operator=(XMUINT4 &&) = default;

        // Component-wise construction; usable in constant expressions.
        constexpr XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) noexcept
            : x{_x},
              y{_y},
              z{_z},
              w{_w}
        {
        }

        // Reads pArray[0] through pArray[3]; the pointer is not validated.
        explicit XMUINT4(const uint32_t *pArray) noexcept
            : x{pArray[0]},
              y{pArray[1]},
              z{pArray[2]},
              w{pArray[3]}
        {
        }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted member-wise equality and three-way comparison.
        bool operator==(const XMUINT4 &) const = default;
        auto operator<=>(const XMUINT4 &) const = default;
#endif
    };

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#pragma clang diagnostic ignored "-Wnested-anon-types"
#pragma clang diagnostic ignored "-Wunknown-warning-option"
#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
#endif

    //------------------------------------------------------------------------------
    // 3x3 Matrix: 32 bit floating point components
    struct XMFLOAT3X3
    {
        // Nine floats reachable either by name (_RC = row R, column C, both
        // 1-based) or through the 2D array m[row][column] (0-based).
        union
        {
            struct
            {
                float _11, _12, _13;
                float _21, _22, _23;
                float _31, _32, _33;
            };
            float m[3][3];
        };

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT3X3() = default;

        XMFLOAT3X3(const XMFLOAT3X3 &) = default;
        XMFLOAT3X3 &operator=(const XMFLOAT3X3 &) = default;

        XMFLOAT3X3(XMFLOAT3X3 &&) = default;
        XMFLOAT3X3 &operator=(XMFLOAT3X3 &&) = default;

        // Element-wise construction; parameter mRC (0-based) initializes the
        // named member _(R+1)(C+1).
        constexpr XMFLOAT3X3(float m00, float m01, float m02,
                             float m10, float m11, float m12,
                             float m20, float m21, float m22) noexcept
            : _11(m00), _12(m01), _13(m02),
              _21(m10), _22(m11), _23(m12),
              _31(m20), _32(m21), _33(m22) {}
        // Construction from a float array; defined outside this section.
        // Presumably reads 9 consecutive floats - confirm against the definition.
        explicit XMFLOAT3X3(const float *pArray) noexcept;

        // Element access by 0-based (Row, Column). Indices are not range-checked.
        float operator()(size_t Row, size_t Column) const noexcept { return m[Row][Column]; }
        float &operator()(size_t Row, size_t Column) noexcept { return m[Row][Column]; }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted comparisons.
        // NOTE(review): this struct has an anonymous union member - confirm the
        // defaulted comparisons are actually usable on the supported compilers.
        bool operator==(const XMFLOAT3X3 &) const = default;
        auto operator<=>(const XMFLOAT3X3 &) const = default;
#endif
    };

    //------------------------------------------------------------------------------
    // 4x3 Row-major Matrix: 32 bit floating point components
    struct XMFLOAT4X3
    {
        // Twelve floats reachable by name (_RC = row R, column C, both 1-based),
        // through the 2D array m[row][column] (0-based, 4 rows of 3), or through
        // the flat array f.
        union
        {
            struct
            {
                float _11, _12, _13;
                float _21, _22, _23;
                float _31, _32, _33;
                float _41, _42, _43;
            };
            float m[4][3];
            float f[12];
        };

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT4X3() = default;

        XMFLOAT4X3(const XMFLOAT4X3 &) = default;
        XMFLOAT4X3 &operator=(const XMFLOAT4X3 &) = default;

        XMFLOAT4X3(XMFLOAT4X3 &&) = default;
        XMFLOAT4X3 &operator=(XMFLOAT4X3 &&) = default;

        // Element-wise construction; parameter mRC (0-based) initializes the
        // named member _(R+1)(C+1).
        constexpr XMFLOAT4X3(float m00, float m01, float m02,
                             float m10, float m11, float m12,
                             float m20, float m21, float m22,
                             float m30, float m31, float m32) noexcept
            : _11(m00), _12(m01), _13(m02),
              _21(m10), _22(m11), _23(m12),
              _31(m20), _32(m21), _33(m22),
              _41(m30), _42(m31), _43(m32) {}
        // Construction from a float array; defined outside this section.
        // Presumably reads 12 consecutive floats - confirm against the definition.
        explicit XMFLOAT4X3(const float *pArray) noexcept;

        // Element access by 0-based (Row, Column). Indices are not range-checked.
        float operator()(size_t Row, size_t Column) const noexcept { return m[Row][Column]; }
        float &operator()(size_t Row, size_t Column) noexcept { return m[Row][Column]; }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted comparisons.
        // NOTE(review): this struct has an anonymous union member - confirm the
        // defaulted comparisons are actually usable on the supported compilers.
        bool operator==(const XMFLOAT4X3 &) const = default;
        auto operator<=>(const XMFLOAT4X3 &) const = default;
#endif
    };

    // 4x3 Row-major Matrix: 32 bit floating point components aligned on a 16 byte boundary
    // XMFLOAT4X3A adds 16-byte alignment to XMFLOAT4X3 and nothing else: it
    // declares no members of its own and inherits every XMFLOAT4X3 constructor.
    XM_ALIGNED_STRUCT(16)
    XMFLOAT4X3A : public XMFLOAT4X3
    {
        using XMFLOAT4X3::XMFLOAT4X3;
    };

    //------------------------------------------------------------------------------
    // 3x4 Column-major Matrix: 32 bit floating point components
    struct XMFLOAT3X4
    {
        // Twelve floats reachable by name (_RC = row R, column C, both 1-based),
        // through the 2D array m[row][column] (0-based, 3 rows of 4), or through
        // the flat array f.
        union
        {
            struct
            {
                float _11, _12, _13, _14;
                float _21, _22, _23, _24;
                float _31, _32, _33, _34;
            };
            float m[3][4];
            float f[12];
        };

        // Defaulted default constructor: performs no member initialization.
        XMFLOAT3X4() = default;

        XMFLOAT3X4(const XMFLOAT3X4 &) = default;
        XMFLOAT3X4 &operator=(const XMFLOAT3X4 &) = default;

        XMFLOAT3X4(XMFLOAT3X4 &&) = default;
        XMFLOAT3X4 &operator=(XMFLOAT3X4 &&) = default;

        // Element-wise construction; parameter mRC (0-based) initializes the
        // named member _(R+1)(C+1).
        constexpr XMFLOAT3X4(float m00, float m01, float m02, float m03,
                             float m10, float m11, float m12, float m13,
                             float m20, float m21, float m22, float m23) noexcept
            : _11(m00), _12(m01), _13(m02), _14(m03),
              _21(m10), _22(m11), _23(m12), _24(m13),
              _31(m20), _32(m21), _33(m22), _34(m23) {}
        // Construction from a float array; defined outside this section.
        // Presumably reads 12 consecutive floats - confirm against the definition.
        explicit XMFLOAT3X4(const float *pArray) noexcept;

        // Element access by 0-based (Row, Column). Indices are not range-checked.
        float operator()(size_t Row, size_t Column) const noexcept { return m[Row][Column]; }
        float &operator()(size_t Row, size_t Column) noexcept { return m[Row][Column]; }

#if (__cplusplus >= 202002L)
        // C++20 and later: defaulted comparisons.
        // NOTE(review): this struct has an anonymous union member - confirm the
        // defaulted comparisons are actually usable on the supported compilers.
        bool operator==(const XMFLOAT3X4 &) const = default;
        auto operator<=>(const XMFLOAT3X4 &) const = default;
#endif
    };

    // 3x4 Column-major Matrix: 32 bit floating point components aligned on a 16 byte boundary
    // Publicly derives from XMFLOAT3X4 and adds no members of its own; the
    // alignment request is expressed through the XM_ALIGNED_STRUCT(16) macro
    // (defined elsewhere).
    XM_ALIGNED_STRUCT(16)
    XMFLOAT3X4A : public XMFLOAT3X4
    {
        // Inherit the XMFLOAT3X4 constructors so the aligned type can be
        // constructed with the same argument lists as its base.
        using XMFLOAT3X4::XMFLOAT3X4;
    };

    //------------------------------------------------------------------------------
    // 4x4 Matrix: 32 bit floating point components
    //
    // Sixteen floats exposed through a single union, reachable either as the
    // named elements _11.._44 or as the two-dimensional array m[4][4].
    struct XMFLOAT4X4
    {
        union
        {
            struct
            {
                float _11, _12, _13, _14; // overlays m[0][0..3]
                float _21, _22, _23, _24; // overlays m[1][0..3]
                float _31, _32, _33, _34; // overlays m[2][0..3]
                float _41, _42, _43, _44; // overlays m[3][0..3]
            };
            float m[4][4];
        };

        // Element-wise constructor; arguments are supplied one row after another.
        constexpr XMFLOAT4X4(float m00, float m01, float m02, float m03,
                             float m10, float m11, float m12, float m13,
                             float m20, float m21, float m22, float m23,
                             float m30, float m31, float m32, float m33) noexcept
            : _11(m00), _12(m01), _13(m02), _14(m03),
              _21(m10), _22(m11), _23(m12), _24(m13),
              _31(m20), _32(m21), _33(m22), _34(m23),
              _41(m30), _42(m31), _43(m32), _44(m33)
        {
        }

        // Construction from a raw float array; only declared here, the
        // definition lives elsewhere.
        explicit XMFLOAT4X4(const float *pArray) noexcept;

        // Compiler-generated default construction, copy and move.
        XMFLOAT4X4() = default;
        XMFLOAT4X4(const XMFLOAT4X4 &) = default;
        XMFLOAT4X4(XMFLOAT4X4 &&) = default;
        XMFLOAT4X4 &operator=(const XMFLOAT4X4 &) = default;
        XMFLOAT4X4 &operator=(XMFLOAT4X4 &&) = default;

        // Read access to element m[Row][Column]; no bounds checking is performed.
        float operator()(size_t Row, size_t Column) const noexcept
        {
            const float(&selectedRow)[4] = m[Row];
            return selectedRow[Column];
        }

        // Writable access to element m[Row][Column]; no bounds checking is performed.
        float &operator()(size_t Row, size_t Column) noexcept
        {
            float(&selectedRow)[4] = m[Row];
            return selectedRow[Column];
        }

#if (__cplusplus >= 202002L)
        // Defaulted comparisons, only requested when compiling as C++20 or later.
        bool operator==(const XMFLOAT4X4 &) const = default;
        auto operator<=>(const XMFLOAT4X4 &) const = default;
#endif
    };

    // 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary
    // Publicly derives from XMFLOAT4X4 and adds no members of its own; the
    // alignment request is expressed through the XM_ALIGNED_STRUCT(16) macro
    // (defined elsewhere).
    XM_ALIGNED_STRUCT(16)
    XMFLOAT4X4A : public XMFLOAT4X4
    {
        // Inherit the XMFLOAT4X4 constructors so the aligned type can be
        // constructed with the same argument lists as its base.
        using XMFLOAT4X4::XMFLOAT4X4;
    };

    ////////////////////////////////////////////////////////////////////////////////

#ifdef __clang__
#pragma clang diagnostic pop
#endif
#ifdef _PREFAST_
#pragma prefast(pop)
#endif
#ifdef _MSC_VER
#pragma warning(pop)
#endif

    /****************************************************************************
     *
     * Data conversion operations
     *
     ****************************************************************************/

    // Conversions between integer-valued and float-valued vectors (declarations
    // only; the definitions live elsewhere). Each takes the source vector plus an
    // unsigned exponent argument and returns the converted vector by value.
    // NOTE(review): DivExponent / MulExponent presumably select a power-of-two
    // scale applied during conversion -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent) noexcept;
    XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept;
    XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent) noexcept;
    XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent) noexcept;

// When __XNAMATH_H__ is defined and XMVectorSetBinaryConstant exists as a macro,
// drop the three macro definitions so the function declarations that follow are
// not rewritten by the preprocessor.
#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant)
#undef XMVectorSetBinaryConstant
#undef XMVectorSplatConstant
#undef XMVectorSplatConstantInt
#endif

    // Constant-building helpers (declarations only). Each takes integer
    // arguments and returns an XMVECTOR by value.
    // NOTE(review): the exact values produced are determined by the definitions,
    // which are not visible here.
    XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept;

    /****************************************************************************
     *
     * Load operations
     *
     ****************************************************************************/

    // Load declarations: every function takes a single pointer to const source
    // data and returns the loaded value by value (XMVECTOR for the scalar and
    // vector forms, XMMATRIX for the matrix forms). Definitions live elsewhere.
    // NOTE(review): the 'A'-suffixed variants take the aligned struct types (or,
    // for the raw uint32_t forms, presumably a 16-byte aligned pointer) --
    // confirm the alignment contract against the definitions.

    // Single-component sources.
    XMVECTOR XM_CALLCONV XMLoadInt( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat( const float *pSource) noexcept;

    // Two-component sources.
    XMVECTOR XM_CALLCONV XMLoadInt2( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadInt2A( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat2( const XMFLOAT2 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat2A( const XMFLOAT2A *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadSInt2( const XMINT2 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadUInt2( const XMUINT2 *pSource) noexcept;

    // Three-component sources.
    XMVECTOR XM_CALLCONV XMLoadInt3( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadInt3A( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat3( const XMFLOAT3 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat3A( const XMFLOAT3A *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadSInt3( const XMINT3 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadUInt3( const XMUINT3 *pSource) noexcept;

    // Four-component sources.
    XMVECTOR XM_CALLCONV XMLoadInt4( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadInt4A( const uint32_t *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat4( const XMFLOAT4 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadFloat4A( const XMFLOAT4A *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadSInt4( const XMINT4 *pSource) noexcept;
    XMVECTOR XM_CALLCONV XMLoadUInt4( const XMUINT4 *pSource) noexcept;

    // Matrix sources; these return an XMMATRIX.
    XMMATRIX XM_CALLCONV XMLoadFloat3x3( const XMFLOAT3X3 *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat4x3( const XMFLOAT4X3 *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat4x3A( const XMFLOAT4X3A *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat3x4( const XMFLOAT3X4 *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat3x4A( const XMFLOAT3X4A *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat4x4( const XMFLOAT4X4 *pSource) noexcept;
    XMMATRIX XM_CALLCONV XMLoadFloat4x4A( const XMFLOAT4X4A *pSource) noexcept;

    /****************************************************************************
     *
     * Store operations
     *
     ****************************************************************************/

    // Store declarations: every function takes a non-const destination pointer
    // first and the value to store second (FXMVECTOR for the scalar and vector
    // forms, FXMMATRIX for the matrix forms) and returns void. Definitions live
    // elsewhere.
    // NOTE(review): the 'A'-suffixed variants take the aligned struct types (or,
    // for the raw uint32_t forms, presumably a 16-byte aligned pointer) --
    // confirm the alignment contract against the definitions.
    void XM_CALLCONV XMStoreInt( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat( float *pDestination,  FXMVECTOR V) noexcept;

    // Two-component destinations.
    void XM_CALLCONV XMStoreInt2( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreInt2A( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat2( XMFLOAT2 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat2A( XMFLOAT2A *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreSInt2( XMINT2 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreUInt2( XMUINT2 *pDestination,  FXMVECTOR V) noexcept;

    // Three-component destinations.
    void XM_CALLCONV XMStoreInt3( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreInt3A( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat3( XMFLOAT3 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat3A( XMFLOAT3A *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreSInt3( XMINT3 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreUInt3( XMUINT3 *pDestination,  FXMVECTOR V) noexcept;

    // Four-component destinations.
    void XM_CALLCONV XMStoreInt4( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreInt4A( uint32_t *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat4( XMFLOAT4 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreFloat4A( XMFLOAT4A *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreSInt4( XMINT4 *pDestination,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMStoreUInt4( XMUINT4 *pDestination,  FXMVECTOR V) noexcept;

    // Matrix destinations; the value to store is an FXMMATRIX.
    void XM_CALLCONV XMStoreFloat3x3( XMFLOAT3X3 *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat4x3( XMFLOAT4X3 *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat4x3A( XMFLOAT4X3A *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat3x4( XMFLOAT3X4 *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat3x4A( XMFLOAT3X4A *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat4x4( XMFLOAT4X4 *pDestination,  FXMMATRIX M) noexcept;
    void XM_CALLCONV XMStoreFloat4x4A( XMFLOAT4X4A *pDestination,  FXMMATRIX M) noexcept;

    /****************************************************************************
     *
     * General vector operations
     *
     ****************************************************************************/

    // Vector construction declarations; every function returns an XMVECTOR by
    // value. Inputs are either nothing at all, scalar values, a pointer to a
    // const scalar (the *Ptr forms), or an existing vector (XMVectorSplatX/Y/Z/W).
    // NOTE(review): the constant-returning functions (Zero, TrueInt, SplatOne,
    // SplatInfinity, ...) presumably yield the value their name suggests in all
    // components -- the definitions are not visible here.
    XMVECTOR XM_CALLCONV XMVectorZero() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSet(float x, float y, float z, float w) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReplicatePtr( const float *pValue) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr( const uint32_t *pValue) noexcept;
    XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept;
    XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept;
    XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept;

    // Component accessors (declarations only).
    //
    // Getters returning a float by value; the ByIndex form takes the component
    // index as a size_t.
    float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept;
    float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept;
    float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept;
    float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept;
    float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept;

    // Getters that return void and take a non-const float pointer as their
    // first argument instead of returning the value.
    void XM_CALLCONV XMVectorGetByIndexPtr( float *f,  FXMVECTOR V,  size_t i) noexcept;
    void XM_CALLCONV XMVectorGetXPtr( float *x,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetYPtr( float *y,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetZPtr( float *z,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetWPtr( float *w,  FXMVECTOR V) noexcept;

    // Getters returning a uint32_t by value.
    uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept;
    uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept;
    uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept;
    uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept;
    uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept;

    // uint32_t getters that return void and take a non-const pointer first.
    void XM_CALLCONV XMVectorGetIntByIndexPtr( uint32_t *x,  FXMVECTOR V,  size_t i) noexcept;
    void XM_CALLCONV XMVectorGetIntXPtr( uint32_t *x,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetIntYPtr( uint32_t *y,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetIntZPtr( uint32_t *z,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorGetIntWPtr( uint32_t *w,  FXMVECTOR V) noexcept;

    // Setters taking a vector and a float value; the result is returned as an
    // XMVECTOR rather than written through a pointer.
    XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept;

    // Setters taking the new float value through a pointer to const.
    XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr( FXMVECTOR V,  const float *f,  size_t i) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetXPtr( FXMVECTOR V,  const float *x) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetYPtr( FXMVECTOR V,  const float *y) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetZPtr( FXMVECTOR V,  const float *z) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetWPtr( FXMVECTOR V,  const float *w) noexcept;

    // Setters taking a vector and a uint32_t value.
    XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept;

    // Setters taking the new uint32_t value through a pointer to const.
    XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr( FXMVECTOR V,  const uint32_t *x,  size_t i) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntXPtr( FXMVECTOR V,  const uint32_t *x) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntYPtr( FXMVECTOR V,  const uint32_t *y) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntZPtr( FXMVECTOR V,  const uint32_t *z) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSetIntWPtr( FXMVECTOR V,  const uint32_t *w) noexcept;

// When __XNAMATH_H__ is defined and XMVectorSwizzle exists as a macro, drop the
// macro so the function declaration below is not rewritten by the preprocessor.
#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle)
#undef XMVectorSwizzle
#endif

    // Component rearrangement and selection (declarations only). The element
    // and permute selectors are passed as run-time uint32_t arguments.
    // NOTE(review): valid selector ranges are not visible here -- check the
    // definitions before passing computed values.
    XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3) noexcept;
    XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2) noexcept;

// Same treatment for the shift/rotate/insert names: the check is made on
// XMVectorShiftLeft only, and all four macros are removed together.
#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft)
#undef XMVectorShiftLeft
#undef XMVectorRotateLeft
#undef XMVectorRotateRight
#undef XMVectorInsert
#endif

    // Shift / rotate / insert declarations; the element counts and select flags
    // are run-time uint32_t arguments.
    XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept;
    XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept;
    XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept;
    XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
                                        uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept;

    // Comparison declarations; every function returns an XMVECTOR by value.
    // The 'R'-suffixed variants additionally take a non-const uint32_t pointer
    // (pCR) as their first argument.
    // NOTE(review): pCR presumably receives a comparison-result record and the
    // returned vector a per-component mask -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorEqualR( uint32_t *pCR,  FXMVECTOR V1,  FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorEqualIntR( uint32_t *pCR,  FXMVECTOR V1,  FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept;
    XMVECTOR XM_CALLCONV XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorGreaterR( uint32_t *pCR,  FXMVECTOR V1,  FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR( uint32_t *pCR,  FXMVECTOR V1,  FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLess(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    // The bounds tests take the vector under test first and the bounds second.
    XMVECTOR XM_CALLCONV XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;
    XMVECTOR XM_CALLCONV XMVectorInBoundsR( uint32_t *pCR,  FXMVECTOR V,  FXMVECTOR Bounds) noexcept;

    // Per-vector classification tests; note that these return an XMVECTOR, not
    // a bool.
    XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept;

    // Min/max, rounding and clamping declarations. XMVectorClamp takes its
    // limits as vectors; XMVectorSaturate takes no limits at all.
    // NOTE(review): the range used by XMVectorSaturate is fixed by its
    // definition, which is not visible here.
    XMVECTOR XM_CALLCONV XMVectorMin(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMax(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept;

    // Two-operand declarations with an 'Int' suffix.
    // NOTE(review): presumably bitwise operations on the raw vector contents
    // (And, And-with-complement, Or, Nor, Xor) -- confirm operand order for
    // XMVectorAndCInt against its definition.
    XMVECTOR XM_CALLCONV XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;

    // Arithmetic, transcendental and interpolation declarations. Unless noted,
    // each takes its operands as vectors and returns the result as an XMVECTOR.
    // NOTE(review): functions with an 'Est' suffix presumably trade accuracy for
    // speed -- the precision guarantees are set by the definitions, not visible
    // here.
    XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept;
    XMVECTOR XM_CALLCONV XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept;
    // XMVectorScale takes its factor as a scalar float rather than a vector.
    XMVECTOR XM_CALLCONV XMVectorScale(FXMVECTOR V, float ScaleFactor) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept;
    // Exponential / logarithm family.
    // NOTE(review): which base the unsuffixed XMVectorExp / XMVectorLog use is
    // not visible here -- check the definitions.
    XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorPow(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorMod(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept;
    // Trigonometric family.
    // NOTE(review): angle units are not stated in these declarations.
    XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept;
    // The SinCos forms return void and take two non-const XMVECTOR pointers
    // (pSin, pCos) ahead of the input vector.
    void XM_CALLCONV XMVectorSinCos( XMVECTOR *pSin,  XMVECTOR *pCos,  FXMVECTOR V) noexcept;
    void XM_CALLCONV XMVectorSinCosEst( XMVECTOR *pSin,  XMVECTOR *pCos,  FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept;
    // The two-argument arctangent forms take Y first, then X.
    XMVECTOR XM_CALLCONV XMVectorATan2(FXMVECTOR Y, FXMVECTOR X) noexcept;
    XMVECTOR XM_CALLCONV XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X) noexcept;
    // Interpolation family. Each function comes in two forms: one taking the
    // interpolation parameter(s) as scalar floats, and a 'V'-suffixed one taking
    // them as vectors. The fourth and fifth vector parameters use the GXMVECTOR
    // and HXMVECTOR parameter types instead of FXMVECTOR.
    XMVECTOR XM_CALLCONV XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t) noexcept;
    XMVECTOR XM_CALLCONV XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T) noexcept;
    XMVECTOR XM_CALLCONV XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t) noexcept;
    XMVECTOR XM_CALLCONV XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, HXMVECTOR T) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t) noexcept;
    XMVECTOR XM_CALLCONV XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, HXMVECTOR T) noexcept;
    XMVECTOR XM_CALLCONV XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g) noexcept;
    XMVECTOR XM_CALLCONV XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, HXMVECTOR G) noexcept;

    /****************************************************************************
     *
     * 2D vector operations
     *
     ****************************************************************************/

    // 2D comparison declarations. Unlike the XMVector* comparisons, these
    // return a scalar: bool for the plain forms and uint32_t for the
    // 'R'-suffixed forms.
    // NOTE(review): presumably only the x and y components take part, and the
    // uint32_t is a comparison-result record -- confirm against the definitions.
    bool XM_CALLCONV XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept;
    bool XM_CALLCONV XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;

    // Classification tests returning bool.
    bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept;
    bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept;

    // 2D geometric declarations. Note that even the dot product, length and
    // angle functions are declared to return an XMVECTOR rather than a float.
    XMVECTOR XM_CALLCONV XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept;
    // ClampLength / Refract come in a scalar-parameter form and a 'V'-suffixed
    // form taking the same parameters as vectors.
    XMVECTOR XM_CALLCONV XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept;
    XMVECTOR XM_CALLCONV XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2) noexcept;
    // Transform declarations. Each single-vector form is paired with a *Stream
    // form that takes an output pointer, an output stride, a const input
    // pointer, an input stride, a vector count and the matrix, and returns a
    // pointer of the output element type. XMVector2TransformStream writes
    // XMFLOAT4 elements; the Coord and Normal streams write XMFLOAT2 elements.
    // NOTE(review): strides are presumably in bytes and the returned pointer is
    // presumably pOutputStream -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMVector2Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT4 *XM_CALLCONV XMVector2TransformStream(XMFLOAT4 *pOutputStream,
                                                    size_t OutputStride,
                                                   const XMFLOAT2 *pInputStream,
                                                    size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;
    XMVECTOR XM_CALLCONV XMVector2TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT2 *XM_CALLCONV XMVector2TransformCoordStream( XMFLOAT2 *pOutputStream,
                                                         size_t OutputStride,
                                                         const XMFLOAT2 *pInputStream,
                                                         size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;
    XMVECTOR XM_CALLCONV XMVector2TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT2 *XM_CALLCONV XMVector2TransformNormalStream( XMFLOAT2 *pOutputStream,
                                                          size_t OutputStride,
                                                          const XMFLOAT2 *pInputStream,
                                                          size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;

    /****************************************************************************
     *
     * 3D vector operations
     *
     ****************************************************************************/

    // 3D comparison declarations. These return a scalar: bool for the plain
    // forms and uint32_t for the 'R'-suffixed forms.
    // NOTE(review): presumably only the x, y and z components take part, and
    // the uint32_t is a comparison-result record -- confirm against the
    // definitions.
    bool XM_CALLCONV XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept;
    bool XM_CALLCONV XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;

    // Classification tests returning bool.
    bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept;
    bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept;

    // 3D geometric declarations. Note that even the dot product, length and
    // angle functions are declared to return an XMVECTOR rather than a float.
    XMVECTOR XM_CALLCONV XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept;
    // ClampLength / Refract come in a scalar-parameter form and a 'V'-suffixed
    // form taking the same parameters as vectors.
    XMVECTOR XM_CALLCONV XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point) noexcept;
    // Returns void; takes two non-const XMVECTOR pointers (pParallel,
    // pPerpendicular) ahead of the input vector and the normal.
    void XM_CALLCONV XMVector3ComponentsFromNormal( XMVECTOR *pParallel,  XMVECTOR *pPerpendicular,  FXMVECTOR V,  FXMVECTOR Normal) noexcept;
    // Rotation by a quaternion that is itself passed as a vector.
    XMVECTOR XM_CALLCONV XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept;
    XMVECTOR XM_CALLCONV XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion) noexcept;
    // Transform declarations. Each single-vector form is paired with a *Stream
    // form that takes an output pointer, an output stride, a const input
    // pointer, an input stride, a vector count and the matrix, and returns a
    // pointer of the output element type. XMVector3TransformStream writes
    // XMFLOAT4 elements; the other streams write XMFLOAT3 elements.
    // NOTE(review): strides are presumably in bytes and the returned pointer is
    // presumably pOutputStream -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMVector3Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT4 *XM_CALLCONV XMVector3TransformStream( XMFLOAT4 *pOutputStream,
                                                    size_t OutputStride,
                                                    const XMFLOAT3 *pInputStream,
                                                    size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;
    XMVECTOR XM_CALLCONV XMVector3TransformCoord(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT3 *XM_CALLCONV XMVector3TransformCoordStream( XMFLOAT3 *pOutputStream,
                                                         size_t OutputStride,
                                                         const XMFLOAT3 *pInputStream,
                                                         size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;
    XMVECTOR XM_CALLCONV XMVector3TransformNormal(FXMVECTOR V, FXMMATRIX M) noexcept;
    XMFLOAT3 *XM_CALLCONV XMVector3TransformNormalStream( XMFLOAT3 *pOutputStream,
                                                          size_t OutputStride,
                                                          const XMFLOAT3 *pInputStream,
                                                          size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;
    // Project / Unproject take six float viewport parameters (X, Y, Width,
    // Height, MinZ, MaxZ) followed by the Projection, View and World matrices.
    // The first matrix uses the FXMMATRIX parameter type and the remaining two
    // use CXMMATRIX.
    XMVECTOR XM_CALLCONV XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
                                          FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept;
    XMFLOAT3 *XM_CALLCONV XMVector3ProjectStream( XMFLOAT3 *pOutputStream,
                                                  size_t OutputStride,
                                                  const XMFLOAT3 *pInputStream,
                                                  size_t InputStride,  size_t VectorCount,
                                                  float ViewportX,  float ViewportY,  float ViewportWidth,  float ViewportHeight,  float ViewportMinZ,  float ViewportMaxZ,
                                                  FXMMATRIX Projection,  CXMMATRIX View,  CXMMATRIX World) noexcept;
    XMVECTOR XM_CALLCONV XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
                                            FXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World) noexcept;
    XMFLOAT3 *XM_CALLCONV XMVector3UnprojectStream( XMFLOAT3 *pOutputStream,
                                                    size_t OutputStride,
                                                    const XMFLOAT3 *pInputStream,
                                                    size_t InputStride,  size_t VectorCount,
                                                    float ViewportX,  float ViewportY,  float ViewportWidth,  float ViewportHeight,  float ViewportMinZ,  float ViewportMaxZ,
                                                    FXMMATRIX Projection,  CXMMATRIX View,  CXMMATRIX World) noexcept;

    /****************************************************************************
     *
     * 4D vector operations
     *
     ****************************************************************************/

    // Comparison predicates. The bool-returning forms give a single yes/no answer for
    // the pair of vectors; the forms suffixed with R return a uint32_t instead
    // (presumably a comparison-record bitmask -- confirm against the definitions).
    // The Int forms presumably compare the lanes as integer bit patterns -- confirm.
    bool XM_CALLCONV XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    // Epsilon is passed as a vector, not a scalar.
    bool XM_CALLCONV XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon) noexcept;
    bool XM_CALLCONV XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    uint32_t XM_CALLCONV XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4Less(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    bool XM_CALLCONV XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds) noexcept;

    // Floating-point classification of a single vector.
    bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept;
    bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept;

    // Geometric operations. Note that every result, including Dot and the Length
    // family, is returned as an XMVECTOR rather than a float. XMVector4Cross takes
    // three input vectors. The Est forms are presumably faster, lower-precision
    // estimates -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3) noexcept;
    XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept;
    // Scalar-bound and vector-bound (V suffix) forms of the same operation.
    XMVECTOR XM_CALLCONV XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept;
    // NOTE(review): the parameter names N1/N2 suggest the Normals forms expect
    // already-normalized inputs -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2) noexcept;
    XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    XMVECTOR XM_CALLCONV XMVector4Transform(FXMVECTOR V, FXMMATRIX M) noexcept;
    // Stream form over VectorCount XMFLOAT4 entries with explicit strides.
    // NOTE(review): strides are presumably in bytes and the return value is presumably
    // pOutputStream -- confirm against the definition.
    XMFLOAT4 *XM_CALLCONV XMVector4TransformStream( XMFLOAT4 *pOutputStream,
                                                    size_t OutputStride,
                                                    const XMFLOAT4 *pInputStream,
                                                    size_t InputStride,  size_t VectorCount,  FXMMATRIX M) noexcept;

    /****************************************************************************
     *
     * Matrix operations
     *
     ****************************************************************************/

    // Classification predicates on a whole matrix.
    bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept;
    bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept;
    bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept;

    // Core algebra. In the two-matrix functions the first argument is an FXMMATRIX and
    // the second a CXMMATRIX; keep that pairing when adding overloads.
    XMMATRIX XM_CALLCONV XMMatrixMultiply(FXMMATRIX M1, CXMMATRIX M2) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose(FXMMATRIX M1, CXMMATRIX M2) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept;
    // pDeterminant is an output pointer.
    // NOTE(review): whether nullptr is accepted is not visible here -- confirm.
    XMMATRIX XM_CALLCONV XMMatrixInverse( XMVECTOR *pDeterminant,  FXMMATRIX M) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct(FXMVECTOR V1, FXMVECTOR V2) noexcept;
    // The determinant is returned as an XMVECTOR, not a float.
    XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept;

    // Writes scale, rotation quaternion and translation through the three output
    // pointers; the bool result presumably reports success -- confirm.
     bool XM_CALLCONV XMMatrixDecompose( XMVECTOR *outScale,  XMVECTOR *outRotQuat,  XMVECTOR *outTrans,  FXMMATRIX M) noexcept;

    // Construction from scalars or vectors.
    XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept;
    XMMATRIX XM_CALLCONV XMMatrixSet(float m00, float m01, float m02, float m03,
                                     float m10, float m11, float m12, float m13,
                                     float m20, float m21, float m22, float m23,
                                     float m30, float m31, float m32, float m33) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept;
    // NOTE(review): angle units (presumably radians) are not visible here -- confirm.
    XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept;

    // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll)
    // Note the parameter order is Pitch, Yaw, Roll, which differs from the function name.
    XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept;

    // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z)
    XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept;

    // NOTE(review): the parameter name NormalAxis suggests RotationNormal expects a
    // normalized axis while RotationAxis does not -- confirm against the definitions.
    XMMATRIX XM_CALLCONV XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixRotationAxis(FXMVECTOR Axis, float Angle) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept;
    // Composite transforms. The vector parameter typedef changes with argument position
    // (FXMVECTOR for the first three vector arguments, then GXMVECTOR, then HXMVECTOR);
    // float parameters in between do not count toward that position.
    XMMATRIX XM_CALLCONV XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
                                                  FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling,
                                                GXMVECTOR RotationOrigin, HXMVECTOR RotationQuaternion, HXMVECTOR Translation) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition) noexcept;

    // View and projection builders. Each comes as an LH/RH pair with identical
    // parameter lists (presumably left-handed / right-handed coordinate systems --
    // confirm). LookAt takes a focus position; LookTo takes an eye direction.
    XMMATRIX XM_CALLCONV XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH(float FovAngleY, float AspectRatio, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept;
    XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ) noexcept;

    /****************************************************************************
     *
     * Quaternion operations
     *
     ****************************************************************************/

    // Quaternions are passed and returned as plain XMVECTOR values; there is no
    // dedicated quaternion type in these signatures.
    bool XM_CALLCONV XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
    bool XM_CALLCONV XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;

    bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept;
    bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept;
    bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept;

    // Algebra and interpolation. Scalar results (Dot, Length, ...) are returned as
    // XMVECTOR. Functions with a V suffix take their interpolation parameters as
    // vectors instead of floats.
    XMVECTOR XM_CALLCONV XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T) noexcept;
    // From the fourth vector argument onward the parameter typedef changes
    // (GXMVECTOR, then HXMVECTOR).
    XMVECTOR XM_CALLCONV XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, HXMVECTOR T) noexcept;
    // Writes three results through pA, pB and pC; presumably the control values that
    // are later fed to XMQuaternionSquad -- confirm against the definition.
    void XM_CALLCONV XMQuaternionSquadSetup( XMVECTOR *pA,  XMVECTOR *pB,  XMVECTOR *pC,  FXMVECTOR Q0,  FXMVECTOR Q1,  FXMVECTOR Q2,  GXMVECTOR Q3) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, HXMVECTOR G) noexcept;

    XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept;

    // Rotates about y-axis (Yaw), then x-axis (Pitch), then z-axis (Roll)
    // Note the parameter order is Pitch, Yaw, Roll, which differs from the function name.
    XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll) noexcept;

    // Rotates about y-axis (Angles.y), then x-axis (Angles.x), then z-axis (Angles.z)
    XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles) noexcept;

    // NOTE(review): the parameter name NormalAxis suggests RotationNormal expects a
    // normalized axis while RotationAxis does not -- confirm against the definitions.
    XMVECTOR XM_CALLCONV XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle) noexcept;
    XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept;

    // Outputs go through pAxis and pAngle; the input quaternion is the last parameter.
    void XM_CALLCONV XMQuaternionToAxisAngle( XMVECTOR *pAxis,  float *pAngle,  FXMVECTOR Q) noexcept;

    /****************************************************************************
     *
     * Plane operations
     *
     ****************************************************************************/

    // Planes are passed and returned as plain XMVECTOR values.
    // NOTE(review): presumably the (a, b, c, d) plane-equation coefficients -- confirm.
    bool XM_CALLCONV XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept;
    // Epsilon is passed as a vector, not a scalar.
    bool XM_CALLCONV XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon) noexcept;
    bool XM_CALLCONV XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2) noexcept;

    bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept;
    bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept;

    // Three dot-product flavours against a vector V; results are returned as XMVECTOR.
    // NOTE(review): how each flavour treats V.w is not visible here -- see the definitions.
    XMVECTOR XM_CALLCONV XMPlaneDot(FXMVECTOR P, FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V) noexcept;
    XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept;
    XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept;
    // The line is described by two points.
    XMVECTOR XM_CALLCONV XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2) noexcept;
    // The resulting line is written as two points through the output pointers.
    // NOTE(review): the no-intersection result is not visible here -- confirm.
    void XM_CALLCONV XMPlaneIntersectPlane( XMVECTOR *pLinePoint1,  XMVECTOR *pLinePoint2,  FXMVECTOR P1,  FXMVECTOR P2) noexcept;

    // Transforms a plane given an inverse transpose matrix
    XMVECTOR XM_CALLCONV XMPlaneTransform(FXMVECTOR P, FXMMATRIX ITM) noexcept;

    // Transforms an array of planes given an inverse transpose matrix
    // NOTE(review): strides are presumably in bytes and the return value is presumably
    // pOutputStream -- confirm against the definition.
    XMFLOAT4 *XM_CALLCONV XMPlaneTransformStream( XMFLOAT4 *pOutputStream,
                                                  size_t OutputStride,
                                                  const XMFLOAT4 *pInputStream,
                                                  size_t InputStride,  size_t PlaneCount,  FXMMATRIX ITM) noexcept;

    // Plane construction from a point and normal, or from three points.
    XMVECTOR XM_CALLCONV XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal) noexcept;
    XMVECTOR XM_CALLCONV XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3) noexcept;

    /****************************************************************************
     *
     * Color operations
     *
     ****************************************************************************/

    // Colors are passed and returned as plain XMVECTOR values.
    // NOTE(review): channel layout (presumably RGBA in x/y/z/w) is not visible here -- confirm.
    bool XM_CALLCONV XMColorEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    bool XM_CALLCONV XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    bool XM_CALLCONV XMColorGreater(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    bool XM_CALLCONV XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    bool XM_CALLCONV XMColorLess(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    bool XM_CALLCONV XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2) noexcept;

    bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept;
    bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept;

    // Per-color adjustments; the adjustment amount is a plain float.
    XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR C) noexcept;
    XMVECTOR XM_CALLCONV XMColorModulate(FXMVECTOR C1, FXMVECTOR C2) noexcept;
    XMVECTOR XM_CALLCONV XMColorAdjustSaturation(FXMVECTOR C, float Saturation) noexcept;
    XMVECTOR XM_CALLCONV XMColorAdjustContrast(FXMVECTOR C, float Contrast) noexcept;

    // Color-space conversions, declared in forward/inverse pairs. The parameter name
    // states the expected input space.
    XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept;

    XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept;

    // NOTE(review): the plain, _HD and _UHD YUV variants presumably use different
    // conversion standards -- confirm the exact coefficients against the definitions.
    XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept;

    XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept;

    XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept;

    XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept;

    XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept;
    XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept;

    XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept;
    XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept;

    /****************************************************************************
     *
     * Miscellaneous operations
     *
     ****************************************************************************/

    // Unlike most declarations in this header, the functions below that take no
    // vector or matrix arguments are declared without XM_CALLCONV.
    // NOTE(review): XMVerifyCPUSupport presumably reports whether the CPU supports the
    // instruction sets this build was compiled for -- confirm against the definition.
    bool XMVerifyCPUSupport() noexcept;

    XMVECTOR XM_CALLCONV XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex) noexcept;

    // Scalar helpers operating on plain floats.
    // NOTE(review): angle units (presumably radians) and any input-range requirements
    // of the Est variants are not visible here -- confirm against the definitions.
    bool XMScalarNearEqual(float S1, float S2, float Epsilon) noexcept;
    float XMScalarModAngle(float Value) noexcept;

    float XMScalarSin(float Value) noexcept;
    float XMScalarSinEst(float Value) noexcept;

    float XMScalarCos(float Value) noexcept;
    float XMScalarCosEst(float Value) noexcept;

    // Both results are written through the output pointers pSin and pCos.
    void XMScalarSinCos( float *pSin,  float *pCos, float Value) noexcept;
    void XMScalarSinCosEst( float *pSin,  float *pCos, float Value) noexcept;

    float XMScalarASin(float Value) noexcept;
    float XMScalarASinEst(float Value) noexcept;

    float XMScalarACos(float Value) noexcept;
    float XMScalarACosEst(float Value) noexcept;

    /****************************************************************************
     *
     * Templates
     *
     ****************************************************************************/

#if defined(__XNAMATH_H__) && defined(XMMin)
    // XMMin already exists as a macro (presumably left behind by the legacy XNAMath
    // header that defines __XNAMATH_H__ -- confirm). Remove both macros so the names
    // can be used by the function templates that follow. Only XMMin is tested;
    // #undef on a name that is not defined is a no-op, so XMMax is safe either way.
#undef XMMin
#undef XMMax
#endif

    // Returns a copy of `a` when `a < b` holds, otherwise a copy of `b`. When the two
    // arguments compare equal (or the comparison is false for any other reason), the
    // second argument is the one returned. Only operator< is required of T.
    template <class T>
    inline T XMMin(T a, T b) noexcept
    {
        const bool firstIsSmaller = static_cast<bool>(a < b);
        return firstIsSmaller ? a : b;
    }
    // Returns a copy of `a` when `a > b` holds, otherwise a copy of `b`. When the two
    // arguments compare equal (or the comparison is false for any other reason), the
    // second argument is the one returned. Only operator> is required of T.
    template <class T>
    inline T XMMax(T a, T b) noexcept
    {
        const bool firstIsLarger = static_cast<bool>(a > b);
        return firstIsLarger ? a : b;
    }

    //------------------------------------------------------------------------------

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

    // Implementation detail of the compile-time XMVectorPermute (SSE builds only).
    // Shuffle is the shuffle immediate applied to the inputs; each Which* flag is
    // true when the corresponding result lane must come from the second vector.
    namespace Internal
    {
        // Generic fallback for lane mixes that a single SSE shuffle cannot express:
        // shuffle both inputs with the same immediate, then merge them lane by lane.
        template <uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW>
        struct PermuteHelper
        {
            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept
            {
                // All-ones lanes keep the shuffled v2 value, zero lanes keep the shuffled v1 value.
                static const XMVECTORU32 takeFromSecond =
                    {{{
                        WhichX ? 0xFFFFFFFF : 0,
                        WhichY ? 0xFFFFFFFF : 0,
                        WhichZ ? 0xFFFFFFFF : 0,
                        WhichW ? 0xFFFFFFFF : 0,
                    }}};

                const XMVECTOR keptFromFirst = _mm_andnot_ps(takeFromSecond, XM_PERMUTE_PS(v1, Shuffle));
                const XMVECTOR keptFromSecond = _mm_and_ps(takeFromSecond, XM_PERMUTE_PS(v2, Shuffle));

                return _mm_or_ps(keptFromFirst, keptFromSecond);
            }
        };

        // Every lane comes from the first vector: one shuffle of v1, v2 is ignored.
        template <uint32_t Shuffle>
        struct PermuteHelper<Shuffle, false, false, false, false>
        {
            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) noexcept
            {
                return XM_PERMUTE_PS(v1, Shuffle);
            }
        };

        // Every lane comes from the second vector: one shuffle of v2, v1 is ignored.
        template <uint32_t Shuffle>
        struct PermuteHelper<Shuffle, true, true, true, true>
        {
            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) noexcept
            {
                return XM_PERMUTE_PS(v2, Shuffle);
            }
        };

        // X and Y from the first vector, Z and W from the second: a single
        // two-operand shuffle with the operands in (v1, v2) order.
        template <uint32_t Shuffle>
        struct PermuteHelper<Shuffle, false, false, true, true>
        {
            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept
            {
                return _mm_shuffle_ps(v1, v2, Shuffle);
            }
        };

        // X and Y from the second vector, Z and W from the first: the same
        // two-operand shuffle with the operands swapped to (v2, v1).
        template <uint32_t Shuffle>
        struct PermuteHelper<Shuffle, true, true, false, false>
        {
            static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) noexcept
            {
                return _mm_shuffle_ps(v2, v1, Shuffle);
            }
        };
    }

#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_

    // General compile-time permute.
    // Each template index must be in the range 0-7 (enforced by static_assert).
    // On SSE builds an index above 3 selects from V2 and the low two bits pick the
    // source lane; other builds forward the indices to the runtime overload of
    // XMVectorPermute.
    template <uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
    inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
        static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
        static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
        static_assert(PermuteW <= 7, "PermuteW template parameter out of range");

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
        // Pick the cheapest helper for this lane mix: the shuffle immediate is built
        // from the lane numbers, and the four flags say which lanes come from V2.
        using Helper = Internal::PermuteHelper<
            _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3),
            (PermuteX > 3),
            (PermuteY > 3),
            (PermuteZ > 3),
            (PermuteW > 3)>;

        return Helper::Permute(V1, V2);
#else
        return XMVectorPermute(V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW);
#endif
    }

    // Special-case permute templates: index sets that reproduce one input unchanged
    // need no shuffle at all.

    // <0, 1, 2, 3> is V1 as-is; the second argument is ignored.
    template <>
    constexpr XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) noexcept
    {
        return V1;
    }

    // <4, 5, 6, 7> is V2 as-is; the first argument is ignored.
    template <>
    constexpr XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) noexcept
    {
        return V2;
    }

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Index sets that map onto a dedicated SSE/SSE2 move or unpack instruction.

    // Result: V1.x, V1.y, V2.x, V2.y
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_movelh_ps(V1, V2);
    }

    // Result: V2.z, V2.w, V1.z, V1.w
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<6, 7, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_movehl_ps(V1, V2);
    }

    // Result: V1.x, V2.x, V1.y, V2.y
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_unpacklo_ps(V1, V2);
    }

    // Result: V1.z, V2.z, V1.w, V2.w
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_unpackhi_ps(V1, V2);
    }

    // Result: V1.z, V1.w, V2.z, V2.w -- done by viewing each input as two doubles and
    // interleaving the high halves.
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const __m128d highPairs = _mm_unpackhi_pd(_mm_castps_pd(V1), _mm_castps_pd(V2));
        return _mm_castpd_ps(highPairs);
    }
#endif

#if defined(_XM_SSE4_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Index sets where every lane stays in place and only the source vector varies.
    // These are a single _mm_blend_ps whose immediate has bit N set exactly when
    // lane N (X=0, Y=1, Z=2, W=3) is taken from V2, i.e. when its index is 4-7.

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x1);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x2);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x3);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x4);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x5);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x6);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x7);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x8);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0x9);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0xA);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0xB);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0xC);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0xD);
    }

    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return _mm_blend_ps(V1, V2, 0xE);
    }
#endif

#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

    // If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead
    // The mirror cases are not spelled out here as the programmer can always swap the arguments
    // (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector)
    //
    // In the vcombine-based specializations below, `xy` holds the two lanes that become
    // result X/Y and `zw` the two lanes that become result Z/W.

    // --- low half of V1 + low half of V2 ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_low_f32(V1);
        const float32x2_t zw = vget_low_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_low_f32(V1));
        const float32x2_t zw = vget_low_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_low_f32(V1);
        const float32x2_t zw = vrev64_f32(vget_low_f32(V2));
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_low_f32(V1));
        const float32x2_t zw = vrev64_f32(vget_low_f32(V2));
        return vcombine_f32(xy, zw);
    }

    // --- high half of V1 + high half of V2 ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_high_f32(V1);
        const float32x2_t zw = vget_high_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_high_f32(V1));
        const float32x2_t zw = vget_high_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_high_f32(V1);
        const float32x2_t zw = vrev64_f32(vget_high_f32(V2));
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_high_f32(V1));
        const float32x2_t zw = vrev64_f32(vget_high_f32(V2));
        return vcombine_f32(xy, zw);
    }

    // --- low half of V1 + high half of V2 ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_low_f32(V1);
        const float32x2_t zw = vget_high_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_low_f32(V1));
        const float32x2_t zw = vget_high_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_low_f32(V1);
        const float32x2_t zw = vrev64_f32(vget_high_f32(V2));
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 0, 7, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_low_f32(V1));
        const float32x2_t zw = vrev64_f32(vget_high_f32(V2));
        return vcombine_f32(xy, zw);
    }

    // --- high half of V1 + low half of V2 (the plain <2, 3, 4, 5> case is handled by vextq below) ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_high_f32(V1));
        const float32x2_t zw = vget_low_f32(V2);
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vget_high_f32(V1);
        const float32x2_t zw = vrev64_f32(vget_low_f32(V2));
        return vcombine_f32(xy, zw);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 2, 5, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        const float32x2_t xy = vrev64_f32(vget_high_f32(V1));
        const float32x2_t zw = vrev64_f32(vget_low_f32(V2));
        return vcombine_f32(xy, zw);
    }

    // --- transpose pairs (vtrnq): val[0] is the even-lane result, val[1] the odd-lane result ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 2, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vtrnq_f32(V1, V2).val[0];
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 5, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vtrnq_f32(V1, V2).val[1];
    }

    // --- interleave (vzipq): val[0] interleaves the low halves, val[1] the high halves ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 4, 1, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vzipq_f32(V1, V2).val[0];
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 6, 3, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vzipq_f32(V1, V2).val[1];
    }

    // --- de-interleave (vuzpq): val[0] gathers the even lanes, val[1] the odd lanes ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 2, 4, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vuzpq_f32(V1, V2).val[0];
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 3, 5, 7>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vuzpq_f32(V1, V2).val[1];
    }

    // --- four consecutive lanes of the V1:V2 concatenation (vextq), starting at lane 1, 2 or 3 ---
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<1, 2, 3, 4>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vextq_f32(V1, V2, 1);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<2, 3, 4, 5>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vextq_f32(V1, V2, 2);
    }
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorPermute<3, 4, 5, 6>(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        return vextq_f32(V1, V2, 3);
    }

#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_

    //------------------------------------------------------------------------------

    // General compile-time swizzle.
    // Each template index must be in the range 0-3 (enforced by static_assert) and
    // names the lane of V that lands in the corresponding result lane. SSE builds
    // emit a single shuffle with an immediate built from the indices; other builds
    // forward the indices to the runtime overload of XMVectorSwizzle.
    template <uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V) noexcept
    {
        static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
        static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
        static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
        static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if !defined(_XM_SSE_INTRINSICS_) || defined(_XM_NO_INTRINSICS_)
        // No SSE shuffle available: defer to the runtime-index overload.
        return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
        // _MM_SHUFFLE takes its arguments from the highest lane down, hence W, Z, Y, X.
        return XM_PERMUTE_PS(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#endif
    }

    // Specialized swizzles
    // <0, 1, 2, 3> leaves every lane where it is, so the input is returned untouched.
    template <>
    constexpr XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) noexcept
    {
        return V;
    }

#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Swizzles that map onto a dedicated SSE move/unpack with V as both operands.

    // Result: x, y, x, y
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept
    {
        return _mm_movelh_ps(V, V);
    }

    // Result: z, w, z, w
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept
    {
        return _mm_movehl_ps(V, V);
    }

    // Result: x, x, y, y
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept
    {
        return _mm_unpacklo_ps(V, V);
    }

    // Result: z, z, w, w
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept
    {
        return _mm_unpackhi_ps(V, V);
    }
#endif

#if defined(_XM_SSE3_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // SSE3 duplicate-move instructions.

    // Result: x, x, z, z (even lanes duplicated)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept
    {
        return _mm_moveldup_ps(V);
    }

    // Result: y, y, w, w (odd lanes duplicated)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept
    {
        return _mm_movehdup_ps(V);
    }
#endif

#if defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
    // Splat of the X lane via the AVX2 broadcast instruction. Only enabled when the
    // build also defines _XM_FAVOR_INTEL_.
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept
    {
        return _mm_broadcastss_ps(V);
    }
#endif

#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

    // ARM-NEON specializations. Lanes 0/1 live in the low 64-bit half of V and
    // lanes 2/3 in the high half; each pattern below is assembled from those
    // halves or from a single whole-register permute.

    // --- Splat one lane across the vector ---

    // (x, x, x, x)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 0, 0>(FXMVECTOR V) noexcept
    {
        return vdupq_lane_f32(vget_low_f32(V), 0);
    }
    // (y, y, y, y)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 1, 1>(FXMVECTOR V) noexcept
    {
        return vdupq_lane_f32(vget_low_f32(V), 1);
    }
    // (z, z, z, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 2, 2>(FXMVECTOR V) noexcept
    {
        return vdupq_lane_f32(vget_high_f32(V), 0);
    }
    // (w, w, w, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 3, 3, 3>(FXMVECTOR V) noexcept
    {
        return vdupq_lane_f32(vget_high_f32(V), 1);
    }

    // --- Swap the two lanes inside each 64-bit half ---

    // (y, x, w, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 3, 2>(FXMVECTOR V) noexcept
    {
        return vrev64q_f32(V);
    }

    // --- Duplicate one half (optionally lane-swapped) into both halves ---

    // (x, y, x, y)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 0, 1>(FXMVECTOR V) noexcept
    {
        const float32x2_t vLow = vget_low_f32(V);
        return vcombine_f32(vLow, vLow);
    }
    // (z, w, z, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 2, 3>(FXMVECTOR V) noexcept
    {
        const float32x2_t vHigh = vget_high_f32(V);
        return vcombine_f32(vHigh, vHigh);
    }
    // (y, x, y, x)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 1, 0>(FXMVECTOR V) noexcept
    {
        const float32x2_t vLowRev = vrev64_f32(vget_low_f32(V));
        return vcombine_f32(vLowRev, vLowRev);
    }
    // (w, z, w, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 3, 2>(FXMVECTOR V) noexcept
    {
        const float32x2_t vHighRev = vrev64_f32(vget_high_f32(V));
        return vcombine_f32(vHighRev, vHighRev);
    }

    // --- Recombine the two halves, each either as-is or lane-swapped ---

    // (x, y, w, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 3, 2>(FXMVECTOR V) noexcept
    {
        const float32x2_t vLow = vget_low_f32(V);
        const float32x2_t vHighRev = vrev64_f32(vget_high_f32(V));
        return vcombine_f32(vLow, vHighRev);
    }
    // (y, x, z, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 0, 2, 3>(FXMVECTOR V) noexcept
    {
        const float32x2_t vLowRev = vrev64_f32(vget_low_f32(V));
        const float32x2_t vHigh = vget_high_f32(V);
        return vcombine_f32(vLowRev, vHigh);
    }
    // (z, w, y, x)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 1, 0>(FXMVECTOR V) noexcept
    {
        const float32x2_t vHigh = vget_high_f32(V);
        const float32x2_t vLowRev = vrev64_f32(vget_low_f32(V));
        return vcombine_f32(vHigh, vLowRev);
    }
    // (w, z, x, y)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 0, 1>(FXMVECTOR V) noexcept
    {
        const float32x2_t vHighRev = vrev64_f32(vget_high_f32(V));
        const float32x2_t vLow = vget_low_f32(V);
        return vcombine_f32(vHighRev, vLow);
    }
    // (w, z, y, x): full reversal
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 2, 1, 0>(FXMVECTOR V) noexcept
    {
        const float32x2_t vHighRev = vrev64_f32(vget_high_f32(V));
        const float32x2_t vLowRev = vrev64_f32(vget_low_f32(V));
        return vcombine_f32(vHighRev, vLowRev);
    }

    // --- Transpose of (V, V): duplicates even or odd lanes ---

    // (x, x, z, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) noexcept
    {
        return vtrnq_f32(V, V).val[0];
    }
    // (y, y, w, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) noexcept
    {
        return vtrnq_f32(V, V).val[1];
    }

    // --- Zip of (V, V): doubles up each lane of one half ---

    // (x, x, y, y)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 1, 1>(FXMVECTOR V) noexcept
    {
        return vzipq_f32(V, V).val[0];
    }
    // (z, z, w, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 2, 3, 3>(FXMVECTOR V) noexcept
    {
        return vzipq_f32(V, V).val[1];
    }

    // --- Unzip of (V, V): gathers even or odd lanes ---

    // (x, z, x, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 2, 0, 2>(FXMVECTOR V) noexcept
    {
        return vuzpq_f32(V, V).val[0];
    }
    // (y, w, y, w)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 3, 1, 3>(FXMVECTOR V) noexcept
    {
        return vuzpq_f32(V, V).val[1];
    }

    // --- Rotations: extract four lanes from (V, V) starting at lane N ---

    // (y, z, w, x)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 2, 3, 0>(FXMVECTOR V) noexcept
    {
        return vextq_f32(V, V, 1);
    }
    // (z, w, x, y)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<2, 3, 0, 1>(FXMVECTOR V) noexcept
    {
        return vextq_f32(V, V, 2);
    }
    // (w, x, y, z)
    template <>
    inline XMVECTOR XM_CALLCONV XMVectorSwizzle<3, 0, 1, 2>(FXMVECTOR V) noexcept
    {
        return vextq_f32(V, V, 3);
    }

#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_

    //------------------------------------------------------------------------------

    // Compile-time shift: forwards to XMVectorPermute with the four consecutive
    // indices Elements .. Elements + 3 over the pair (V1, V2). Elements must be 0..3.
    // NOTE(review): presumably permute indices 4..7 address V2 -- confirm against XMVectorPermute.
    template <uint32_t Elements>
    inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) noexcept
    {
        static_assert(Elements < 4, "Elements template parameter out of range");

        constexpr uint32_t Lane0 = Elements;
        constexpr uint32_t Lane1 = Elements + 1;
        constexpr uint32_t Lane2 = Elements + 2;
        constexpr uint32_t Lane3 = Elements + 3;
        return XMVectorPermute<Lane0, Lane1, Lane2, Lane3>(V1, V2);
    }

    // Compile-time left rotation by Elements lanes (0..3): result lane i is
    // source lane (i + Elements) mod 4, expressed as a swizzle.
    template <uint32_t Elements>
    inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V) noexcept
    {
        static_assert(Elements < 4, "Elements template parameter out of range");

        // "& 3" wraps each index back into 0..3.
        constexpr uint32_t Lane0 = Elements & 3;
        constexpr uint32_t Lane1 = (Elements + 1) & 3;
        constexpr uint32_t Lane2 = (Elements + 2) & 3;
        constexpr uint32_t Lane3 = (Elements + 3) & 3;
        return XMVectorSwizzle<Lane0, Lane1, Lane2, Lane3>(V);
    }

    // Compile-time right rotation by Elements lanes (0..3): result lane i is
    // source lane (i + 4 - Elements) mod 4, expressed as a swizzle.
    template <uint32_t Elements>
    inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V) noexcept
    {
        static_assert(Elements < 4, "Elements template parameter out of range");

        // Offsets 4..7 keep the unsigned subtraction non-negative; "& 3" wraps into 0..3.
        constexpr uint32_t Lane0 = (4 - Elements) & 3;
        constexpr uint32_t Lane1 = (5 - Elements) & 3;
        constexpr uint32_t Lane2 = (6 - Elements) & 3;
        constexpr uint32_t Lane3 = (7 - Elements) & 3;
        return XMVectorSwizzle<Lane0, Lane1, Lane2, Lane3>(V);
    }

    // Compile-time insert: rotates VS left by VSLeftRotateElements lanes, then
    // passes VD, the rotated VS, and a control built from bit 0 of Select0..Select3
    // to XMVectorSelect.
    // NOTE(review): presumably a set select bit picks the rotated-VS lane and a
    // clear bit keeps VD -- confirm against XMVectorSelect.
    template <uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3>
    inline XMVECTOR XM_CALLCONV XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) noexcept
    {
        // Only the low bit of each Select parameter is significant.
        const XMVECTOR vControl = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
        const XMVECTOR vRotatedSource = XMVectorRotateLeft<VSLeftRotateElements>(VS);
        return XMVectorSelect(VD, vRotatedSource, vControl);
    }

    /****************************************************************************
     *
     * Globals
     *
     ****************************************************************************/

    // The purpose of the following global constants is to prevent redundant
    // reloading of the constants when they are referenced by more than one
    // separate inline math routine called within the same function.  Declaring
    // a constant locally within a routine is sufficient to prevent redundant
    // reloads of that constant when that single routine is called multiple
    // times in a function, but if the constant is used (and declared) in a
    // separate math routine it would be reloaded.

#ifndef XMGLOBALCONST
// XMGLOBALCONST is the declaration prefix used by every g_XM* constant below.
// It is only defined here if the including code has not already defined it:
//   - GCC-compatible compilers other than MinGW: `extern const` + the `weak` attribute
//   - everything else (including MinGW):         `extern const` + `__declspec(selectany)`
// NOTE(review): presumably both forms exist so the initialized constants can live in a
// header included by many translation units without duplicate-definition link errors -- confirm.
#if defined(__GNUC__) && !defined(__MINGW32__)
#define XMGLOBALCONST extern const __attribute__((weak))
#else
#define XMGLOBALCONST extern const __declspec(selectany)
#endif
#endif

    // Coefficient tables for the trigonometric / inverse-trigonometric routines.
    // Each vector packs four scalar coefficients, one per lane.
    // NOTE(review): the values look like polynomial-approximation coefficients
    // (e.g. the sine table is close to -1/3!, +1/5!, -1/7!, +1/9!); the lanes tagged
    // Est1..Est3 are presumably used by the lower-precision "Est" variants -- confirm
    // against the consuming functions.
    XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {{{-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f}}};
    XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {{{-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/}}};
    XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {{{-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f}}};
    XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {{{-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/}}};
    // Tangent: twelve coefficients spread over three vectors (1, 1/3, 2/15, 17/315, ...).
    XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {{{1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}}};
    XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {{{2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}}};
    XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {{{5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}}};
    // Arc (asin/acos) and arctangent coefficient tables.
    XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {{{+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f}}};
    XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {{{+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f}}};
    XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {{{-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f}}};
    XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {{{-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f}}};
    // Reduced-size tables named for the estimate ("Est") variants.
    XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {{{+0.999866f, +0.999866f, +0.999866f, +0.999866f}}};
    XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {{{-0.3302995f, +0.180141f, -0.085133f, +0.0208351f}}};
    XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {{{2.484f, -1.954923183e-1f, 2.467401101f, XM_1DIVPI}}};
    XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {{{+1.5707288f, -0.2121144f, +0.0742610f, -0.0187293f}}};
    // Four different pi-related scalars packed into one vector: XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI.
    XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {{{XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}}};
    // Rows of the 4x4 identity matrix: row N has 1.0f in lane N and 0.0f elsewhere.
    XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {{{1.0f, 0.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {{{0.0f, 1.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {{{0.0f, 0.0f, 1.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {{{0.0f, 0.0f, 0.0f, 1.0f}}};
    // Same rows with -1.0f in place of 1.0f.
    XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {{{-1.0f, 0.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {{{0.0f, -1.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {{{0.0f, 0.0f, -1.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {{{0.0f, 0.0f, 0.0f, -1.0f}}};
    // Sign-bit patterns: 0x80000000 is the float sign bit (the bit pattern of -0.0f).
    // g_XMNegativeZero sets it in all four lanes, g_XMNegate3 in x/y/z only.
    XMGLOBALCONST XMVECTORU32 g_XMNegativeZero = {{{0x80000000, 0x80000000, 0x80000000, 0x80000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMNegate3 = {{{0x80000000, 0x80000000, 0x80000000, 0x00000000}}};
    // Whole-lane bit masks: all ones in the lanes named by the suffix, zero elsewhere
    // (g_XMMask3 covers x/y/z).
    XMGLOBALCONST XMVECTORU32 g_XMMaskXY = {{{0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMMask3 = {{{0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMMaskX = {{{0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMMaskY = {{{0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMMaskZ = {{{0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMMaskW = {{{0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}}};
    // Commonly used scalars replicated ("splatted") across all four lanes.
    // g_XMOne3 is the exception: 1.0f in x/y/z and 0.0f in w.
    XMGLOBALCONST XMVECTORF32 g_XMOne = {{{1.0f, 1.0f, 1.0f, 1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMOne3 = {{{1.0f, 1.0f, 1.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMZero = {{{0.0f, 0.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMTwo = {{{2.f, 2.f, 2.f, 2.f}}};
    XMGLOBALCONST XMVECTORF32 g_XMFour = {{{4.f, 4.f, 4.f, 4.f}}};
    XMGLOBALCONST XMVECTORF32 g_XMSix = {{{6.f, 6.f, 6.f, 6.f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {{{-1.0f, -1.0f, -1.0f, -1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMOneHalf = {{{0.5f, 0.5f, 0.5f, 0.5f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegativeOneHalf = {{{-0.5f, -0.5f, -0.5f, -0.5f}}};
    // Splatted multiples and reciprocals of pi, built from the XM_* scalar macros.
    XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {{{-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {{{-XM_PI, -XM_PI, -XM_PI, -XM_PI}}};
    XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {{{XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}}};
    XMGLOBALCONST XMVECTORF32 g_XMPi = {{{XM_PI, XM_PI, XM_PI, XM_PI}}};
    XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {{{XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}}};
    XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {{{XM_2PI, XM_2PI, XM_2PI, XM_2PI}}};
    XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {{{XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}}};
    // 1.192092896e-7f is 2^-23, the single-precision machine epsilon (FLT_EPSILON).
    XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {{{1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}}};
    // IEEE-754 single-precision bit patterns stored as integers, splatted to all lanes.
    // 0x7F800000: exponent all ones, mantissa zero (+infinity).
    XMGLOBALCONST XMVECTORI32 g_XMInfinity = {{{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}}};
    // 0x7FC00000: exponent all ones plus the top mantissa bit (a quiet NaN).
    XMGLOBALCONST XMVECTORI32 g_XMQNaN = {{{0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}}};
    // 0x007FFFFF: mask of the 23 mantissa bits.
    XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {{{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}}};
    // 0x7FFFFFFF: every bit except the sign bit.
    XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {{{0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}}};
    // 0x00800000: smallest positive normal float; 0x7F7FFFFF: largest finite float.
    XMGLOBALCONST XMVECTORI32 g_XMFltMin = {{{0x00800000, 0x00800000, 0x00800000, 0x00800000}}};
    XMGLOBALCONST XMVECTORI32 g_XMFltMax = {{{0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}}};
    // All bits set in every lane.
    XMGLOBALCONST XMVECTORU32 g_XMNegOneMask = {{{0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}}};
    // Per-lane constants named after packed formats. For each format there is a
    // Mask (bit-field of each channel), a Flip (bits to toggle), a Fix (float
    // offset) and a Normalize (float scale) vector.
    // NOTE(review): presumably consumed by the packed-vector load/store routines
    // defined elsewhere in the library -- confirm there before changing any value.

    // A8R8G8B8: 8-bit channel masks; lane w holds the top byte.
    XMGLOBALCONST XMVECTORU32 g_XMMaskA8R8G8B8 = {{{0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipA8R8G8B8 = {{{0x00000000, 0x00000000, 0x00000000, 0x80000000}}};
    XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {{{0.0f, 0.0f, 0.0f, float(0x80000000U)}}};
    XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {{{1.0f / (255.0f * float(0x10000)), 1.0f / (255.0f * float(0x100)), 1.0f / 255.0f, 1.0f / (255.0f * float(0x1000000))}}};
    // A2B10G10R10: three 10-bit fields plus a 2-bit field in the top bits.
    XMGLOBALCONST XMVECTORU32 g_XMMaskA2B10G10R10 = {{{0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipA2B10G10R10 = {{{0x00000200, 0x00080000, 0x20000000, 0x80000000}}};
    XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {{{-512.0f, -512.0f * float(0x400), -512.0f * float(0x100000), float(0x80000000U)}}};
    XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {{{1.0f / 511.0f, 1.0f / (511.0f * float(0x400)), 1.0f / (511.0f * float(0x100000)), 1.0f / (3.0f * float(0x40000000))}}};
    // X16Y16: two 16-bit fields (low half in lane x, high half in lane y).
    XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16 = {{{0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {{{0x00008000, 0x00000000, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {{{-32768.0f, 0.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {{{1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f}}};
    // X16Y16Z16W16: low 16-bit field in lanes x/y, high 16-bit field in lanes z/w.
    XMGLOBALCONST XMVECTORU32 g_XMMaskX16Y16Z16W16 = {{{0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}}};
    XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {{{0x00008000, 0x00008000, 0x00000000, 0x00000000}}};
    XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {{{-32768.0f, -32768.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {{{1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 1.0f / (32767.0f * 65536.0f)}}};
    // 8388608.0f is 2^23: floats of this magnitude or larger have no fractional bits.
    XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {{{8388608.0f, 8388608.0f, 8388608.0f, 8388608.0f}}};
    // Low 8 bits of every lane.
    XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {{{0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}}};
    // Multiplicative sign vectors: -1.0f in the named lane, +1.0f in the others.
    XMGLOBALCONST XMVECTORF32 g_XMNegateX = {{{-1.0f, 1.0f, 1.0f, 1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegateY = {{{1.0f, -1.0f, 1.0f, 1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegateZ = {{{1.0f, 1.0f, -1.0f, 1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMNegateW = {{{1.0f, 1.0f, 1.0f, -1.0f}}};
    // Select-control vectors: the four digits in the name give, lane by lane
    // (x, y, z, w), whether the lane holds XM_SELECT_1 or XM_SELECT_0.
    XMGLOBALCONST XMVECTORU32 g_XMSelect0101 = {{{XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}}};
    XMGLOBALCONST XMVECTORU32 g_XMSelect1010 = {{{XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}}};
    // 0x3EFFFFFD is a float bit pattern three ulps below 0.5f (0x3F000000).
    XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = {{{0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}}};
    XMGLOBALCONST XMVECTORU32 g_XMSelect1000 = {{{XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}}};
    XMGLOBALCONST XMVECTORU32 g_XMSelect1100 = {{{XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}}};
    XMGLOBALCONST XMVECTORU32 g_XMSelect1110 = {{{XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}}};
    XMGLOBALCONST XMVECTORU32 g_XMSelect1011 = {{{XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1}}};
    // Scale vectors that divide selected lanes by 65536 (2^16).
    // NOTE(review): presumably used to move a 16-bit field from the high half of a
    // word down to unit scale -- confirm against the packed-vector routines.
    XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {{{1.0f, 1.0f / 65536.0f, 0.0f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {{{1.0f, 1.0f, 1.0f / 65536.0f, 1.0f / 65536.0f}}};
    // Sign-bit (0x80000000) toggles for the lanes named in the suffix.
    XMGLOBALCONST XMVECTORU32 g_XMFlipY = {{{0, 0x80000000, 0, 0}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipZ = {{{0, 0, 0x80000000, 0}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipW = {{{0, 0, 0, 0x80000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipYZ = {{{0, 0x80000000, 0x80000000, 0}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipZW = {{{0, 0, 0x80000000, 0x80000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMFlipYW = {{{0, 0x80000000, 0, 0x80000000}}};
    // "Dec4" constants: three 10-bit fields at bit offsets 0/10/20 and a 2-bit field
    // in the top two bits, with matching xor, add and multiply vectors.
    XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {{{0x3FF, 0x3FF << 10, 0x3FF << 20, static_cast<int>(0xC0000000)}}};
    XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {{{0x200, 0x200 << 10, 0x200 << 20, 0}}};
    XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {{{0, 0, 0, 32768.0f * 65536.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {{{-512.0f, -512.0f * 1024.0f, -512.0f * 1024.0f * 1024.0f, 0}}};
    XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {{{1.0f, 1.0f / 1024.0f, 1.0f / (1024.0f * 1024.0f), 1.0f / (1024.0f * 1024.0f * 1024.0f)}}};
    // "Byte4" constants: four 8-bit fields at bit offsets 0/8/16/24.
    XMGLOBALCONST XMVECTORU32 g_XMMaskByte4 = {{{0xFF, 0xFF00, 0xFF0000, 0xFF000000}}};
    XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {{{0x80, 0x8000, 0x800000, 0x00000000}}};
    XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {{{-128.0f, -128.0f * 256.0f, -128.0f * 65536.0f, 0}}};
    // Integer-range helpers used by the int<->float conversion functions in this file:
    //   g_XMFixUnsigned / g_XMUnsignedFix : 2^31 (32768 * 65536) in every lane
    //   g_XMMaxInt                        : 2^31 - 128, threshold for signed positive overflow
    //   g_XMMaxUInt                       : 2^32 - 256, threshold for unsigned overflow
    XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {{{32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {{{65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f, 65536.0f * 32768.0f - 128.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {{{65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f, 65536.0f * 65536.0f - 256.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {{{32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f, 32768.0f * 65536.0f}}};
    // sRGB helper vectors: x/y/z carry the constant, w carries 1.0f or 0.0f.
    // NOTE(review): 12.92 and 0.055 / 1.055 match the constants of the standard sRGB
    // transfer function; w is presumably left so alpha passes through -- confirm.
    XMGLOBALCONST XMVECTORF32 g_XMsrgbScale = {{{12.92f, 12.92f, 12.92f, 1.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMsrgbA = {{{0.055f, 0.055f, 0.055f, 0.0f}}};
    XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = {{{1.055f, 1.055f, 1.055f, 1.0f}}};
    // Integer constants describing the single-precision float layout:
    // exponent bias 127, subnormal exponent -126, 23 trailing (mantissa) bits.
    XMGLOBALCONST XMVECTORI32 g_XMExponentBias = {{{127, 127, 127, 127}}};
    XMGLOBALCONST XMVECTORI32 g_XMSubnormalExponent = {{{-126, -126, -126, -126}}};
    XMGLOBALCONST XMVECTORI32 g_XMNumTrailing = {{{23, 23, 23, 23}}};
    // Float bit patterns: smallest normal, -infinity, negative quiet NaN,
    // 128.0f (0x43000000) and -150.0f (0xC3160000).
    XMGLOBALCONST XMVECTORI32 g_XMMinNormal = {{{0x00800000, 0x00800000, 0x00800000, 0x00800000}}};
    XMGLOBALCONST XMVECTORU32 g_XMNegInfinity = {{{0xFF800000, 0xFF800000, 0xFF800000, 0xFF800000}}};
    XMGLOBALCONST XMVECTORU32 g_XMNegQNaN = {{{0xFFC00000, 0xFFC00000, 0xFFC00000, 0xFFC00000}}};
    XMGLOBALCONST XMVECTORI32 g_XMBin128 = {{{0x43000000, 0x43000000, 0x43000000, 0x43000000}}};
    XMGLOBALCONST XMVECTORU32 g_XMBinNeg150 = {{{0xC3160000, 0xC3160000, 0xC3160000, 0xC3160000}}};
    XMGLOBALCONST XMVECTORI32 g_XM253 = {{{253, 253, 253, 253}}};
    // Splatted coefficients named for the exponential estimate.
    // NOTE(review): presumably one polynomial term per constant -- confirm against the exp routines.
    XMGLOBALCONST XMVECTORF32 g_XMExpEst1 = {{{-6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f, -6.93147182e-1f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst2 = {{{+2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f, +2.40226462e-1f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst3 = {{{-5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f, -5.55036440e-2f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst4 = {{{+9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f, +9.61597636e-3f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst5 = {{{-1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f, -1.32823968e-3f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst6 = {{{+1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f, +1.47491097e-4f}}};
    XMGLOBALCONST XMVECTORF32 g_XMExpEst7 = {{{-1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f, -1.08635004e-5f}}};
    // Splatted coefficients named for the logarithm estimate.
    XMGLOBALCONST XMVECTORF32 g_XMLogEst0 = {{{+1.442693f, +1.442693f, +1.442693f, +1.442693f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst1 = {{{-0.721242f, -0.721242f, -0.721242f, -0.721242f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst2 = {{{+0.479384f, +0.479384f, +0.479384f, +0.479384f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst3 = {{{-0.350295f, -0.350295f, -0.350295f, -0.350295f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst4 = {{{+0.248590f, +0.248590f, +0.248590f, +0.248590f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst5 = {{{-0.145700f, -0.145700f, -0.145700f, -0.145700f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst6 = {{{+0.057148f, +0.057148f, +0.057148f, +0.057148f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLogEst7 = {{{-0.010578f, -0.010578f, -0.010578f, -0.010578f}}};
    // Base-conversion factors: log2(e) ~ 1.442695, ln(2) ~ 0.693147,
    // log2(10) ~ 3.321928, log10(2) ~ 0.301030.
    XMGLOBALCONST XMVECTORF32 g_XMLgE = {{{+1.442695f, +1.442695f, +1.442695f, +1.442695f}}};
    XMGLOBALCONST XMVECTORF32 g_XMInvLgE = {{{+6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f, +6.93147182e-1f}}};
    XMGLOBALCONST XMVECTORF32 g_XMLg10 = {{{+3.321928f, +3.321928f, +3.321928f, +3.321928f}}};
    XMGLOBALCONST XMVECTORF32 g_XMInvLg10 = {{{+3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f, +3.010299956e-1f}}};
    // Splatted integer-type limits as floats. Note the signed minimums are
    // symmetric (-127 / -32767), not the two's-complement -128 / -32768.
    XMGLOBALCONST XMVECTORF32 g_UByteMax = {{{255.0f, 255.0f, 255.0f, 255.0f}}};
    XMGLOBALCONST XMVECTORF32 g_ByteMin = {{{-127.0f, -127.0f, -127.0f, -127.0f}}};
    XMGLOBALCONST XMVECTORF32 g_ByteMax = {{{127.0f, 127.0f, 127.0f, 127.0f}}};
    XMGLOBALCONST XMVECTORF32 g_ShortMin = {{{-32767.0f, -32767.0f, -32767.0f, -32767.0f}}};
    XMGLOBALCONST XMVECTORF32 g_ShortMax = {{{32767.0f, 32767.0f, 32767.0f, 32767.0f}}};
    XMGLOBALCONST XMVECTORF32 g_UShortMax = {{{65535.0f, 65535.0f, 65535.0f, 65535.0f}}};

    /****************************************************************************
     *
     * Implementation
     *
     ****************************************************************************/

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4068 4214 4204 4365 4616 4640 6001 6101)
    // C4068/4616: ignore unknown pragmas
    // C4214/4204: nonstandard extension used
    // C4365/4640: Off by default noise
    // C6001/6101: False positives
#endif

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes")
#pragma prefast(disable : 26495, "Union initialization confuses /analyze")
#endif

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wfloat-equal"
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#pragma clang diagnostic ignored "-Wunknown-warning-option"
#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
#endif

    //------------------------------------------------------------------------------

    // Builds a vector whose lanes are 1.0f or 0.0f depending on bit 0 of the
    // corresponding argument (C0 -> x, C1 -> y, C2 -> z, C3 -> w). All higher
    // bits of the arguments are ignored.
    inline XMVECTOR XM_CALLCONV XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) noexcept
    {
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
        // The scalar and NEON builds share the same per-lane construction.
        // 0x3F800000 is the bit pattern of 1.0f; zero bits are 0.0f.
        constexpr uint32_t OneBits = 0x3F800000;
        XMVECTORU32 vResult;
        vResult.u[0] = (C0 & 1) ? OneBits : 0u;
        vResult.u[1] = (C1 & 1) ? OneBits : 0u;
        vResult.u[2] = (C2 & 1) ? OneBits : 0u;
        vResult.u[3] = (C3 & 1) ? OneBits : 0u;
        return vResult.v;
#else // XM_SSE_INTRINSICS_
        static const XMVECTORU32 g_vMask1 = {{{1, 1, 1, 1}}};
        // Gather the four parameters into one integer vector (x = C0 ... w = C3)
        const __m128i vParams = _mm_set_epi32(static_cast<int>(C3), static_cast<int>(C2), static_cast<int>(C1), static_cast<int>(C0));
        // Keep only bit 0 of every lane
        const __m128i vLowBits = _mm_and_si128(vParams, g_vMask1);
        // Lanes whose bit was set become 0xFFFFFFFF, the rest 0
        const __m128i vIsSet = _mm_cmpeq_epi32(vLowBits, g_vMask1);
        // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
        const __m128i vOneOrZero = _mm_and_si128(vIsSet, g_XMOne);
        return _mm_castsi128_ps(vOneOrZero);
#endif
    }

    //------------------------------------------------------------------------------

    // Returns a vector with every lane equal to IntConstant / 2^DivExponent.
    //
    // IntConstant must be in [-16, 15] and DivExponent must be below 32; both
    // preconditions are checked with assert only.
    //
    // The SIMD paths build the reciprocal scale from bits: subtracting
    // (DivExponent << 23) from 0x3F800000 (the pattern of 1.0f) lowers the
    // exponent field by DivExponent, giving exactly 1.0f / (1 << DivExponent).
    inline XMVECTOR XM_CALLCONV XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) noexcept
    {
        assert(IntConstant >= -16 && IntConstant <= 15);
        assert(DivExponent < 32);
#if defined(_XM_NO_INTRINSICS_)

        using DirectX::XMConvertVectorIntToFloat;

        XMVECTORI32 V = {{{IntConstant, IntConstant, IntConstant, IntConstant}}};
        return XMConvertVectorIntToFloat(V.v, DivExponent);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Splat the int and convert it to float
        XMVECTOR vResult = vcvtq_f32_s32(vdupq_n_s32(IntConstant));
        // Convert DivExponent into the bit pattern of 1.0f/(1<<DivExponent)
        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
        // Splat the bits and view them as floats. vreinterpretq is the intrinsic
        // for this bit-cast; it replaces a reinterpret_cast through a pointer
        // to an unrelated vector type.
        float32x4_t vScale = vreinterpretq_f32_u32(vdupq_n_u32(uScale));
        // Multiply by the reciprocal (Perform a right shift by DivExponent)
        return vmulq_f32(vResult, vScale);
#else // XM_SSE_INTRINSICS_
        // Splat the int
        __m128i vScale = _mm_set1_epi32(IntConstant);
        // Convert to a float
        XMVECTOR vResult = _mm_cvtepi32_ps(vScale);
        // Convert DivExponent into 1.0f/(1<<DivExponent)
        uint32_t uScale = 0x3F800000U - (DivExponent << 23);
        // Splat the scalar value (It's really a float)
        vScale = _mm_set1_epi32(static_cast<int>(uScale));
        // Multiply by the reciprocal (Perform a right shift by DivExponent)
        vResult = _mm_mul_ps(vResult, _mm_castsi128_ps(vScale));
        return vResult;
#endif
    }

    //------------------------------------------------------------------------------

    // Returns a vector whose four lanes all hold the raw 32-bit integer
    // IntConstant (no int-to-float conversion is performed; the bits are only
    // carried in an XMVECTOR).
    //
    // IntConstant must be in [-16, 15]; the precondition is checked with assert only.
    inline XMVECTOR XM_CALLCONV XMVectorSplatConstantInt(int32_t IntConstant) noexcept
    {
        assert(IntConstant >= -16 && IntConstant <= 15);
#if defined(_XM_NO_INTRINSICS_)

        XMVECTORI32 V = {{{IntConstant, IntConstant, IntConstant, IntConstant}}};
        return V.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // vreinterpretq is the intrinsic for this bit-cast; it replaces a
        // reinterpret_cast through a pointer to an unrelated vector type.
        return vreinterpretq_f32_s32(vdupq_n_s32(IntConstant));
#else // XM_SSE_INTRINSICS_
        __m128i V = _mm_set1_epi32(IntConstant);
        return _mm_castsi128_ps(V);
#endif
    }

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4701)
// C4701: false positives
#endif

    // Treats each lane of VInt as a signed 32-bit integer, converts it to float
    // and divides by 2^DivExponent. DivExponent must be below 32 (assert only).
    inline XMVECTOR XM_CALLCONV XMConvertVectorIntToFloat(
        FXMVECTOR VInt,
        uint32_t DivExponent) noexcept
    {
        assert(DivExponent < 32);
#if defined(_XM_NO_INTRINSICS_)
        const float fScale = 1.0f / static_cast<float>(1U << DivExponent);
        XMVECTOR Result;
        for (uint32_t Lane = 0; Lane < 4; ++Lane)
        {
            // Reinterpret the stored bits as signed before converting
            const auto iValue = static_cast<int32_t>(VInt.vector4_u32[Lane]);
            Result.vector4_f32[Lane] = static_cast<float>(iValue) * fScale;
        }
        return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float fScale = 1.0f / static_cast<float>(1U << DivExponent);
        const float32x4_t vAsFloat = vcvtq_f32_s32(vreinterpretq_s32_f32(VInt));
        return vmulq_n_f32(vAsFloat, fScale);
#else // _XM_SSE_INTRINSICS_
        // Signed int lanes -> float lanes
        const XMVECTOR vAsFloat = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
        // Bit pattern of 1.0f/(1<<DivExponent): lower the exponent field of 1.0f
        const uint32_t uScale = 0x3F800000U - (DivExponent << 23);
        // Splat the scale and apply it
        const __m128i vScaleBits = _mm_set1_epi32(static_cast<int>(uScale));
        return _mm_mul_ps(vAsFloat, _mm_castsi128_ps(vScaleBits));
#endif
    }

    //------------------------------------------------------------------------------

    // Multiplies each lane of VFloat by 2^MulExponent and converts it to a signed
    // 32-bit integer, truncating toward zero. Results above 2^31 - 128 are
    // forced to 0x7FFFFFFF. MulExponent must be below 32 (assert only).
    // NOTE(review): NaN lanes are not special-cased; in the scalar path they
    // reach the plain static_cast.
    inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToInt(
        FXMVECTOR VFloat,
        uint32_t MulExponent) noexcept
    {
        assert(MulExponent < 32);
#if defined(_XM_NO_INTRINSICS_)
        // Scale factor 2^MulExponent
        const auto fScale = static_cast<float>(1U << MulExponent);
        XMVECTOR Result;
        for (uint32_t Lane = 0; Lane < 4; ++Lane)
        {
            const float fScaled = VFloat.vector4_f32[Lane] * fScale;
            int32_t iValue;
            if (fScaled <= -(65536.0f * 32768.0f))
            {
                // At or below -2^31: saturate to INT32_MIN
                iValue = (-0x7FFFFFFF) - 1;
            }
            else if (fScaled > (65536.0f * 32768.0f) - 128.0f)
            {
                // Above the largest float below 2^31: saturate to INT32_MAX
                iValue = 0x7FFFFFFF;
            }
            else
            {
                iValue = static_cast<int32_t>(fScaled);
            }
            Result.vector4_u32[Lane] = static_cast<uint32_t>(iValue);
        }
        return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float32x4_t vScaled = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
        // All-ones mask in the lanes that overflow on the positive side
        const uint32x4_t vOverflow = vcgtq_f32(vScaled, g_XMMaxInt);
        // Float to int conversion
        const int32x4_t vConverted = vcvtq_s32_f32(vScaled);
        // 0x7FFFFFFF in overflowing lanes, 0 elsewhere
        const uint32x4_t vSaturated = vandq_u32(vOverflow, g_XMAbsMask);
        // Converted value in non-overflowing lanes, 0 elsewhere
        const uint32x4_t vKept = vbicq_u32(vreinterpretq_u32_s32(vConverted), vOverflow);
        return vreinterpretq_f32_u32(vorrq_u32(vKept, vSaturated));
#else // _XM_SSE_INTRINSICS_
        const XMVECTOR vScaled = _mm_mul_ps(_mm_set_ps1(static_cast<float>(1U << MulExponent)), VFloat);
        // All-ones mask in the lanes that overflow on the positive side
        const XMVECTOR vOverflow = _mm_cmpgt_ps(vScaled, g_XMMaxInt);
        // Float to int conversion (truncating)
        const __m128i vConverted = _mm_cvttps_epi32(vScaled);
        // 0x7FFFFFFF in overflowing lanes, 0 elsewhere
        const XMVECTOR vSaturated = _mm_and_ps(vOverflow, g_XMAbsMask);
        // Converted value in non-overflowing lanes, 0 elsewhere
        const XMVECTOR vKept = _mm_andnot_ps(vOverflow, _mm_castsi128_ps(vConverted));
        return _mm_or_ps(vKept, vSaturated);
#endif
    }

    //------------------------------------------------------------------------------

    // Treats each lane of VUInt as an unsigned 32-bit integer, converts it to
    // float and divides by 2^DivExponent. DivExponent must be below 32 (assert only).
    inline XMVECTOR XM_CALLCONV XMConvertVectorUIntToFloat(
        FXMVECTOR VUInt,
        uint32_t DivExponent) noexcept
    {
        assert(DivExponent < 32);
#if defined(_XM_NO_INTRINSICS_)
        const float fScale = 1.0f / static_cast<float>(1U << DivExponent);
        XMVECTOR Result;
        for (uint32_t Lane = 0; Lane < 4; ++Lane)
        {
            Result.vector4_f32[Lane] = static_cast<float>(VUInt.vector4_u32[Lane]) * fScale;
        }
        return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float fScale = 1.0f / static_cast<float>(1U << DivExponent);
        const float32x4_t vAsFloat = vcvtq_f32_u32(vreinterpretq_u32_f32(VUInt));
        return vmulq_n_f32(vAsFloat, fScale);
#else // _XM_SSE_INTRINSICS_
        // Only a signed int->float conversion is used here, so lanes above
        // 0x7FFFFFFF are converted without their top bit and patched afterwards.
        // Isolate the top bit of every lane
        const XMVECTOR vTopBit = _mm_and_ps(VUInt, g_XMNegativeZero);
        // Clear it so every lane is non-negative as a signed int
        const XMVECTOR vLow31 = _mm_xor_ps(VUInt, vTopBit);
        // Convert to floats
        XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(vLow31));
        // Arithmetic shift turns 0x80000000 into 0xFFFFFFFF (and leaves 0 as 0)
        const __m128i iNeedsFix = _mm_srai_epi32(_mm_castps_si128(vTopBit), 31);
        // Add 2^31 back, but only in the lanes that had the top bit set
        const XMVECTOR vFixup = _mm_and_ps(_mm_castsi128_ps(iNeedsFix), g_XMFixUnsigned);
        vResult = _mm_add_ps(vResult, vFixup);
        // Bit pattern of 1.0f/(1<<DivExponent)
        const uint32_t uScale = 0x3F800000U - (DivExponent << 23);
        // Splat the scale and apply it
        const __m128i iScale = _mm_set1_epi32(static_cast<int>(uScale));
        return _mm_mul_ps(vResult, _mm_castsi128_ps(iScale));
#endif
    }

    //------------------------------------------------------------------------------

    // Multiplies each lane of VFloat by 2^MulExponent and converts it to an
    // unsigned 32-bit integer, truncating toward zero. Results that are too
    // large are forced to 0xFFFFFFFF; the scalar and SSE paths also clamp
    // non-positive results to 0. MulExponent must be below 32 (assert only).
    inline XMVECTOR XM_CALLCONV XMConvertVectorFloatToUInt(
        FXMVECTOR VFloat,
        uint32_t MulExponent) noexcept
    {
        assert(MulExponent < 32);
#if defined(_XM_NO_INTRINSICS_)
        // Scale factor 2^MulExponent
        const auto fScale = static_cast<float>(1U << MulExponent);
        XMVECTOR Result;
        for (uint32_t Lane = 0; Lane < 4; ++Lane)
        {
            const float fScaled = VFloat.vector4_f32[Lane] * fScale;
            uint32_t uValue;
            if (fScaled <= 0.0f)
            {
                uValue = 0;
            }
            else if (fScaled >= (65536.0f * 65536.0f))
            {
                // 2^32 and above does not fit: saturate
                uValue = 0xFFFFFFFFU;
            }
            else
            {
                uValue = static_cast<uint32_t>(fScaled);
            }
            Result.vector4_u32[Lane] = uValue;
        }
        return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float32x4_t vScaled = vmulq_n_f32(VFloat, static_cast<float>(1U << MulExponent));
        // All-ones mask in the lanes that overflow
        const uint32x4_t vOverflow = vcgtq_f32(vScaled, g_XMMaxUInt);
        // Float to unsigned int conversion
        const uint32x4_t vConverted = vcvtq_u32_f32(vScaled);
        // Converted value in non-overflowing lanes; overflowing lanes become 0xFFFFFFFF
        const uint32x4_t vKept = vbicq_u32(vConverted, vOverflow);
        return vreinterpretq_f32_u32(vorrq_u32(vOverflow, vKept));
#else // _XM_SSE_INTRINSICS_
        const XMVECTOR vScaled = _mm_mul_ps(_mm_set_ps1(static_cast<float>(1U << MulExponent)), VFloat);
        // Clamp to >= 0
        const XMVECTOR vClamped = _mm_max_ps(vScaled, g_XMZero);
        // Lanes that are too big will be forced to 0xFFFFFFFF at the end
        const XMVECTOR vOverflow = _mm_cmpgt_ps(vClamped, g_XMMaxUInt);
        const XMVECTOR vBias = g_XMUnsignedFix;
        // Too large for a signed integer?
        const XMVECTOR vIsLarge = _mm_cmpge_ps(vClamped, vBias);
        // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
        const XMVECTOR vLaneBias = _mm_and_ps(vBias, vIsLarge);
        // Subtract the bias only from the large lanes (keeps low bit precision)
        const XMVECTOR vRebased = _mm_sub_ps(vClamped, vLaneBias);
        const __m128i iConverted = _mm_cvttps_epi32(vRebased);
        // Convert from signed to unsigned only if greater than 0x80000000:
        // put the top bit back in the lanes that were rebased
        const XMVECTOR vTopBit = _mm_and_ps(vIsLarge, g_XMNegativeZero);
        const XMVECTOR vUnsigned = _mm_xor_ps(_mm_castsi128_ps(iConverted), vTopBit);
        // On those that are too large, set to 0xFFFFFFFF
        return _mm_or_ps(vUnsigned, vOverflow);
#endif
    }

#ifdef _MSC_VER
#pragma warning(pop)
#endif

    /****************************************************************************
     *
     * Vector and matrix load operations
     *
     ****************************************************************************/

    //------------------------------------------------------------------------------
    // Loads one 32-bit unsigned integer into lane 0 of the returned vector and
    // zeroes lanes 1-3. The bit pattern is copied verbatim (no int-to-float
    // conversion). pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt(const uint32_t *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_u32[3] = 0;
        vLoaded.vector4_u32[2] = 0;
        vLoaded.vector4_u32[1] = 0;
        vLoaded.vector4_u32[0] = pSource[0];
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Begin with an all-zero register and overwrite lane 0 only.
        return vreinterpretq_f32_u32(vld1q_lane_u32(pSource, vdupq_n_u32(0), 0));
#elif defined(_XM_SSE_INTRINSICS_)
        // Scalar load of the raw bits, viewed as a float.
        const float *pBits = reinterpret_cast<const float *>(pSource);
        return _mm_load_ss(pBits);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads one float into lane 0 of the returned vector; lanes 1-3 are 0.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat(const float *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = 0.f;
        vLoaded.vector4_f32[2] = 0.f;
        vLoaded.vector4_f32[1] = 0.f;
        vLoaded.vector4_f32[0] = pSource[0];
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Begin with an all-zero register and overwrite lane 0 only.
        return vld1q_lane_f32(pSource, vdupq_n_f32(0), 0);
#elif defined(_XM_SSE_INTRINSICS_)
        const XMVECTOR vLoaded = _mm_load_ss(pSource);
        return vLoaded;
#endif
    }

    //------------------------------------------------------------------------------
    // Loads two consecutive 32-bit unsigned integers into lanes 0 and 1 of the
    // returned vector; lanes 2 and 3 are zero. Bits are copied verbatim.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt2(const uint32_t *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = (lane < 2) ? pSource[lane] : 0;
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half comes from memory, high half is zero.
        return vreinterpretq_f32_u32(vcombine_u32(vld1_u32(pSource), vdup_n_u32(0)));
#elif defined(_XM_SSE_INTRINSICS_)
        // A single 64-bit scalar load picks up both integers at once.
        const __m128d lowPair = _mm_load_sd(reinterpret_cast<const double *>(pSource));
        return _mm_castpd_ps(lowPair);
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the two-integer load: lanes 0 and 1 receive
    // pSource[0] and pSource[1], lanes 2 and 3 are zero. Bits are copied
    // verbatim. pSource must be non-null and 16-byte aligned (both asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt2A(const uint32_t *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = (lane < 2) ? pSource[lane] : 0;
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only load taking an alignment hint in bits.
        uint32x2_t lowHalf = vld1_u32_ex(pSource, 64);
#else
        uint32x2_t lowHalf = vld1_u32(pSource);
#endif
        return vreinterpretq_f32_u32(vcombine_u32(lowHalf, vdup_n_u32(0)));
#elif defined(_XM_SSE_INTRINSICS_)
        // A single 64-bit scalar load picks up both integers at once.
        const __m128d lowPair = _mm_load_sd(reinterpret_cast<const double *>(pSource));
        return _mm_castpd_ps(lowPair);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the x and y members of an XMFLOAT2 into lanes 0 and 1 of the
    // returned vector; lanes 2 and 3 are 0. pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat2(const XMFLOAT2 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = 0.f;
        vLoaded.vector4_f32[2] = 0.f;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half comes from memory, high half is zero.
        const float *pFloats = reinterpret_cast<const float *>(pSource);
        return vcombine_f32(vld1_f32(pFloats), vdup_n_f32(0));
#elif defined(_XM_SSE_INTRINSICS_)
        // A single 64-bit scalar load picks up both floats at once.
        const __m128d lowPair = _mm_load_sd(reinterpret_cast<const double *>(pSource));
        return _mm_castpd_ps(lowPair);
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the two-float load: x and y go to lanes 0 and 1,
    // lanes 2 and 3 are 0. pSource must be non-null and 16-byte aligned
    // (both asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat2A(const XMFLOAT2A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = 0.f;
        vLoaded.vector4_f32[2] = 0.f;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float *pFloats = reinterpret_cast<const float *>(pSource);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only load taking an alignment hint in bits.
        float32x2_t lowHalf = vld1_f32_ex(pFloats, 64);
#else
        float32x2_t lowHalf = vld1_f32(pFloats);
#endif
        return vcombine_f32(lowHalf, vdup_n_f32(0));
#elif defined(_XM_SSE_INTRINSICS_)
        // A single 64-bit scalar load picks up both floats at once.
        const __m128d lowPair = _mm_load_sd(reinterpret_cast<const double *>(pSource));
        return _mm_castpd_ps(lowPair);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the two signed integers of an XMINT2 and converts them to floats
    // in lanes 0 and 1 of the returned vector; lanes 2 and 3 are 0.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadSInt2(const XMINT2 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = 0.f;
        vConverted.vector4_f32[2] = 0.f;
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert the 64-bit pair first, then widen with a zero upper half.
        float32x2_t lowHalf = vcvt_f32_s32(vld1_s32(reinterpret_cast<const int32_t *>(pSource)));
        return vcombine_f32(lowHalf, vdup_n_f32(0));
#elif defined(_XM_SSE_INTRINSICS_)
        // 64-bit load of both integers, then a four-lane int -> float convert.
        const __m128d lowPair = _mm_load_sd(reinterpret_cast<const double *>(pSource));
        const __m128i asInts = _mm_castps_si128(_mm_castpd_ps(lowPair));
        return _mm_cvtepi32_ps(asInts);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the two unsigned integers of an XMUINT2 and converts them to floats
    // in lanes 0 and 1 of the returned vector; lanes 2 and 3 are 0.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadUInt2(const XMUINT2 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = 0.f;
        vConverted.vector4_f32[2] = 0.f;
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert the 64-bit pair first, then widen with a zero upper half.
        float32x2_t lowHalf = vcvt_f32_u32(vld1_u32(reinterpret_cast<const uint32_t *>(pSource)));
        return vcombine_f32(lowHalf, vdup_n_f32(0));
#elif defined(_XM_SSE_INTRINSICS_)
        // The int -> float conversion used here is signed, so lanes with the
        // top bit set are converted without that bit and corrected afterwards.
        const __m128 raw = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        // Isolate the top bit of every lane.
        const __m128 topBit = _mm_and_ps(raw, g_XMNegativeZero);
        // Strip it so every lane is non-negative as a signed integer.
        const __m128 stripped = _mm_xor_ps(raw, topBit);
        const __m128 asFloat = _mm_cvtepi32_ps(_mm_castps_si128(stripped));
        // Smear the top bit across the lane: 0x80000000 -> 0xFFFFFFFF.
        const __m128i laneSelect = _mm_srai_epi32(_mm_castps_si128(topBit), 31);
        // Select the correction constant only for lanes that had the top bit
        // (g_XMFixUnsigned is presumably 2^31 as a float - confirm at its definition).
        const __m128 correction = _mm_and_ps(_mm_castsi128_ps(laneSelect), g_XMFixUnsigned);
        return _mm_add_ps(asFloat, correction);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads three consecutive 32-bit unsigned integers into lanes 0-2 of the
    // returned vector; lane 3 is zero. Bits are copied verbatim. Only the three
    // source words are read. pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt3(const uint32_t *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = (lane < 3) ? pSource[lane] : 0;
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half: words 0 and 1. High half: word 2 dropped into a zero pair.
        uint32x2_t lowHalf = vld1_u32(pSource);
        uint32x2_t highHalf = vld1_lane_u32(pSource + 2, vdup_n_u32(0), 0);
        return vreinterpretq_f32_u32(vcombine_u32(lowHalf, highHalf));
#elif defined(_XM_SSE4_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(pSource + 2));
        // 0x20: copy lane 0 of 'third' into lane 2 of 'firstTwo'.
        return _mm_insert_ps(firstTwo, third, 0x20);
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(pSource + 2));
        // Result = low half of 'firstTwo' followed by low half of 'third'.
        return _mm_movelh_ps(firstTwo, third);
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the three-integer load: lanes 0-2 receive
    // pSource[0..2], lane 3 is zero. Bits are copied verbatim.
    // pSource must be non-null and 16-byte aligned (both asserted).
    // Note: the NEON path reads a full four words from pSource before
    // clearing lane 3, so the fourth word must be readable there.
    inline XMVECTOR XM_CALLCONV XMLoadInt3A(const uint32_t *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = (lane < 3) ? pSource[lane] : 0;
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Full-width load; the surplus fourth word is overwritten with zero.
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        uint32x4_t allFour = vld1q_u32_ex(pSource, 128);
#else
        uint32x4_t allFour = vld1q_u32(pSource);
#endif
        return vreinterpretq_f32_u32(vsetq_lane_u32(0, allFour, 3));
#elif defined(_XM_SSE4_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(pSource + 2));
        // 0x20: copy lane 0 of 'third' into lane 2 of 'firstTwo'.
        return _mm_insert_ps(firstTwo, third, 0x20);
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(pSource + 2));
        // Result = low half of 'firstTwo' followed by low half of 'third'.
        return _mm_movelh_ps(firstTwo, third);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the x, y and z members of an XMFLOAT3 into lanes 0-2 of the
    // returned vector; lane 3 is 0. pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat3(const XMFLOAT3 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = 0.f;
        vLoaded.vector4_f32[2] = pSource->z;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float *pFloats = reinterpret_cast<const float *>(pSource);
        // Low half: floats 0 and 1. High half: float 2 dropped into a zero pair.
        float32x2_t lowHalf = vld1_f32(pFloats);
        float32x2_t highHalf = vld1_lane_f32(pFloats + 2, vdup_n_f32(0), 0);
        return vcombine_f32(lowHalf, highHalf);
#elif defined(_XM_SSE4_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(&pSource->z);
        // 0x20: copy lane 0 of 'third' into lane 2 of 'firstTwo'.
        return _mm_insert_ps(firstTwo, third, 0x20);
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(&pSource->z);
        // Result = low half of 'firstTwo' followed by low half of 'third'.
        return _mm_movelh_ps(firstTwo, third);
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the three-float load: x, y, z go to lanes 0-2 and
    // lane 3 is 0. pSource must be non-null and 16-byte aligned (both asserted).
    // Note: both SIMD paths read a full 16 bytes starting at pSource and then
    // discard the fourth float.
    inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = 0.f;
        vLoaded.vector4_f32[2] = pSource->z;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Full-width load; the surplus fourth float is overwritten with zero.
        const float *pFloats = reinterpret_cast<const float *>(pSource);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        float32x4_t allFour = vld1q_f32_ex(pFloats, 128);
#else
        float32x4_t allFour = vld1q_f32(pFloats);
#endif
        return vsetq_lane_f32(0, allFour, 3);
#elif defined(_XM_SSE_INTRINSICS_)
        // Full-width aligned load; the surplus fourth float is masked away.
        const __m128 allFour = _mm_load_ps(&pSource->x);
        return _mm_and_ps(allFour, g_XMMask3);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the three signed integers of an XMINT3 and converts them to floats
    // in lanes 0-2 of the returned vector; lane 3 is 0.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadSInt3(const XMINT3 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = 0.f;
        vConverted.vector4_f32[2] = static_cast<float>(pSource->z);
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const int32_t *pInts = reinterpret_cast<const int32_t *>(pSource);
        // Low half: ints 0 and 1. High half: int 2 dropped into a zero pair.
        int32x2_t lowHalf = vld1_s32(pInts);
        int32x2_t highHalf = vld1_lane_s32(pInts + 2, vdup_n_s32(0), 0);
        return vcvtq_f32_s32(vcombine_s32(lowHalf, highHalf));
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(&pSource->z));
        // Gather x,y,z,0 as raw bits, then convert all four lanes int -> float.
        const __m128i asInts = _mm_castps_si128(_mm_movelh_ps(firstTwo, third));
        return _mm_cvtepi32_ps(asInts);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the three unsigned integers of an XMUINT3 and converts them to
    // floats in lanes 0-2 of the returned vector; lane 3 is 0.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadUInt3(const XMUINT3 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = 0.f;
        vConverted.vector4_f32[2] = static_cast<float>(pSource->z);
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const uint32_t *pWords = reinterpret_cast<const uint32_t *>(pSource);
        // Low half: words 0 and 1. High half: word 2 dropped into a zero pair.
        uint32x2_t lowHalf = vld1_u32(pWords);
        uint32x2_t highHalf = vld1_lane_u32(pWords + 2, vdup_n_u32(0), 0);
        return vcvtq_f32_u32(vcombine_u32(lowHalf, highHalf));
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 firstTwo = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(pSource)));
        const __m128 third = _mm_load_ss(reinterpret_cast<const float *>(&pSource->z));
        const __m128 raw = _mm_movelh_ps(firstTwo, third);
        // The int -> float conversion used here is signed, so lanes with the
        // top bit set are converted without that bit and corrected afterwards.
        // Isolate the top bit of every lane.
        const __m128 topBit = _mm_and_ps(raw, g_XMNegativeZero);
        // Strip it so every lane is non-negative as a signed integer.
        const __m128 stripped = _mm_xor_ps(raw, topBit);
        const __m128 asFloat = _mm_cvtepi32_ps(_mm_castps_si128(stripped));
        // Smear the top bit across the lane: 0x80000000 -> 0xFFFFFFFF.
        const __m128i laneSelect = _mm_srai_epi32(_mm_castps_si128(topBit), 31);
        // Select the correction constant only for lanes that had the top bit
        // (g_XMFixUnsigned is presumably 2^31 as a float - confirm at its definition).
        const __m128 correction = _mm_and_ps(_mm_castsi128_ps(laneSelect), g_XMFixUnsigned);
        return _mm_add_ps(asFloat, correction);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads four consecutive 32-bit unsigned integers into the four lanes of
    // the returned vector. Bits are copied verbatim; no alignment requirement
    // is asserted. pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt4(const uint32_t *pSource) noexcept
    {
        assert(pSource);

#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = pSource[lane];
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        uint32x4_t allFour = vld1q_u32(pSource);
        return vreinterpretq_f32_u32(allFour);
#elif defined(_XM_SSE_INTRINSICS_)
        // Unaligned 128-bit integer load, reinterpreted as four floats.
        return _mm_castsi128_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pSource)));
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the four-integer load: all four lanes receive
    // pSource[0..3] with bits copied verbatim.
    // pSource must be non-null and 16-byte aligned (both asserted).
    inline XMVECTOR XM_CALLCONV XMLoadInt4A(const uint32_t *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        for (uint32_t lane = 0; lane < 4; ++lane)
        {
            vLoaded.vector4_u32[lane] = pSource[lane];
        }
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only load taking an alignment hint in bits.
        return vld1q_u32_ex(pSource, 128);
#else
        uint32x4_t allFour = vld1q_u32(pSource);
        return vreinterpretq_f32_u32(allFour);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // Aligned 128-bit integer load, reinterpreted as four floats.
        return _mm_castsi128_ps(_mm_load_si128(reinterpret_cast<const __m128i *>(pSource)));
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the x, y, z and w members of an XMFLOAT4 into lanes 0-3 of the
    // returned vector. No alignment requirement is asserted.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat4(const XMFLOAT4 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = pSource->w;
        vLoaded.vector4_f32[2] = pSource->z;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float *pFloats = reinterpret_cast<const float *>(pSource);
        return vld1q_f32(pFloats);
#elif defined(_XM_SSE_INTRINSICS_)
        // Unaligned 128-bit load starting at the first member.
        const float *pFirst = &pSource->x;
        return _mm_loadu_ps(pFirst);
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the four-float load: x, y, z, w go to lanes 0-3.
    // pSource must be non-null and 16-byte aligned (both asserted).
    inline XMVECTOR XM_CALLCONV XMLoadFloat4A(const XMFLOAT4A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vLoaded;
        vLoaded.vector4_f32[3] = pSource->w;
        vLoaded.vector4_f32[2] = pSource->z;
        vLoaded.vector4_f32[1] = pSource->y;
        vLoaded.vector4_f32[0] = pSource->x;
        return vLoaded;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const float *pFloats = reinterpret_cast<const float *>(pSource);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only load taking an alignment hint in bits.
        return vld1q_f32_ex(pFloats, 128);
#else
        return vld1q_f32(pFloats);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // Aligned 128-bit load starting at the first member.
        const float *pFirst = &pSource->x;
        return _mm_load_ps(pFirst);
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the four signed integers of an XMINT4 and converts each one to a
    // float in the matching lane of the returned vector.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadSInt4(const XMINT4 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = static_cast<float>(pSource->w);
        vConverted.vector4_f32[2] = static_cast<float>(pSource->z);
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        return vcvtq_f32_s32(vld1q_s32(reinterpret_cast<const int32_t *>(pSource)));
#elif defined(_XM_SSE_INTRINSICS_)
        // Unaligned 128-bit integer load followed by int -> float conversion.
        return _mm_cvtepi32_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pSource)));
#endif
    }

    //------------------------------------------------------------------------------
    // Loads the four unsigned integers of an XMUINT4 and converts each one to a
    // float in the matching lane of the returned vector.
    // pSource must be non-null (asserted).
    inline XMVECTOR XM_CALLCONV XMLoadUInt4(const XMUINT4 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)
        XMVECTOR vConverted;
        vConverted.vector4_f32[3] = static_cast<float>(pSource->w);
        vConverted.vector4_f32[2] = static_cast<float>(pSource->z);
        vConverted.vector4_f32[1] = static_cast<float>(pSource->y);
        vConverted.vector4_f32[0] = static_cast<float>(pSource->x);
        return vConverted;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        return vcvtq_f32_u32(vld1q_u32(reinterpret_cast<const uint32_t *>(pSource)));
#elif defined(_XM_SSE_INTRINSICS_)
        const __m128 raw = _mm_castsi128_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pSource)));
        // The int -> float conversion used here is signed, so lanes with the
        // top bit set are converted without that bit and corrected afterwards.
        // Isolate the top bit of every lane.
        const __m128 topBit = _mm_and_ps(raw, g_XMNegativeZero);
        // Strip it so every lane is non-negative as a signed integer.
        const __m128 stripped = _mm_xor_ps(raw, topBit);
        const __m128 asFloat = _mm_cvtepi32_ps(_mm_castps_si128(stripped));
        // Smear the top bit across the lane: 0x80000000 -> 0xFFFFFFFF.
        const __m128i laneSelect = _mm_srai_epi32(_mm_castps_si128(topBit), 31);
        // Select the correction constant only for lanes that had the top bit
        // (g_XMFixUnsigned is presumably 2^31 as a float - confirm at its definition).
        const __m128 correction = _mm_and_ps(_mm_castsi128_ps(laneSelect), g_XMFixUnsigned);
        return _mm_add_ps(asFloat, correction);
#endif
    }

    //------------------------------------------------------------------------------
    // Expands a 3x3 float matrix into a 4x4 XMMATRIX. Source element m[i][j]
    // lands in row i, lane j for i,j in 0..2; lane 3 of rows 0-2 is 0 and
    // row 3 is (0,0,0,1) in the scalar path / g_XMIdentityR3 in the SIMD paths.
    // pSource must be non-null (asserted). The SIMD paths use overlapping
    // unaligned four-float loads at m[0][0] and m[1][1] plus a single-float
    // load of m[2][2], which together cover exactly the nine source floats.
     inline XMMATRIX XM_CALLCONV XMLoadFloat3x3(const XMFLOAT3X3 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)

        XMMATRIX M;
        M.r[0].vector4_f32[0] = pSource->m[0][0];
        M.r[0].vector4_f32[1] = pSource->m[0][1];
        M.r[0].vector4_f32[2] = pSource->m[0][2];
        M.r[0].vector4_f32[3] = 0.0f;

        M.r[1].vector4_f32[0] = pSource->m[1][0];
        M.r[1].vector4_f32[1] = pSource->m[1][1];
        M.r[1].vector4_f32[2] = pSource->m[1][2];
        M.r[1].vector4_f32[3] = 0.0f;

        M.r[2].vector4_f32[0] = pSource->m[2][0];
        M.r[2].vector4_f32[1] = pSource->m[2][1];
        M.r[2].vector4_f32[2] = pSource->m[2][2];
        M.r[2].vector4_f32[3] = 0.0f;
        M.r[3].vector4_f32[0] = 0.0f;
        M.r[3].vector4_f32[1] = 0.0f;
        M.r[3].vector4_f32[2] = 0.0f;
        M.r[3].vector4_f32[3] = 1.0f;
        return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // v0 = m00,m01,m02,m10
        float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
        // v1 = m11,m12,m20,m21
        float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
        // v2 = m22,0 (the 32-bit pattern is zero-extended to 64 bits)
        float32x2_t v2 = vcreate_f32(static_cast<uint64_t>(*reinterpret_cast<const uint32_t *>(&pSource->m[2][2])));
        // T = m10,m11,m12,m20
        float32x4_t T = vextq_f32(v0, v1, 3);

        XMMATRIX M;
        // Masking with g_XMMask3 clears lane 3 of rows 0 and 1.
        M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
        M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMask3));
        // Row 2 = m20,m21,m22,0
        M.r[2] = vcombine_f32(vget_high_f32(v1), v2);
        M.r[3] = g_XMIdentityR3;
        return M;
#elif defined(_XM_SSE_INTRINSICS_)
        __m128 Z = _mm_setzero_ps();

        // V1 = m00,m01,m02,m10
        __m128 V1 = _mm_loadu_ps(&pSource->m[0][0]);
        // V2 = m11,m12,m20,m21
        __m128 V2 = _mm_loadu_ps(&pSource->m[1][1]);
        // V3 = m22,0,0,0
        __m128 V3 = _mm_load_ss(&pSource->m[2][2]);

        // T1 = m02,0,m10,0
        __m128 T1 = _mm_unpackhi_ps(V1, Z);
        // T2 = m11,0,m12,0
        __m128 T2 = _mm_unpacklo_ps(V2, Z);
        // T3 = m22,m22,0,m11
        __m128 T3 = _mm_shuffle_ps(V3, T2, _MM_SHUFFLE(0, 1, 0, 0));
        // T4 = 0,m11,m12,0
        __m128 T4 = _mm_movehl_ps(T2, T3);
        // T5 = m10,0,0,0
        __m128 T5 = _mm_movehl_ps(Z, T1);

        XMMATRIX M;
        // Row 0 = m00,m01,m02,0
        M.r[0] = _mm_movelh_ps(V1, T1);
        // Row 1 = m10,m11,m12,0 (T4 and T5 have disjoint non-zero lanes, so the
        // add merges them)
        M.r[1] = _mm_add_ps(T4, T5);
        // Row 2 = m20,m21,m22,0
        M.r[2] = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 3, 2));
        M.r[3] = g_XMIdentityR3;
        return M;
#endif
    }

    //------------------------------------------------------------------------------
    // Expands a 4x3 float matrix (four rows of three floats) into a 4x4
    // XMMATRIX. Source element m[i][j] lands in row i, lane j; lane 3 is 0 for
    // rows 0-2 and 1.0f for row 3. pSource must be non-null (asserted).
    // The SIMD paths fetch the 12 source floats with three four-float loads at
    // m[0][0], m[1][1] and m[2][2] and then re-slice them into rows.
     inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)

        XMMATRIX M;
        M.r[0].vector4_f32[0] = pSource->m[0][0];
        M.r[0].vector4_f32[1] = pSource->m[0][1];
        M.r[0].vector4_f32[2] = pSource->m[0][2];
        M.r[0].vector4_f32[3] = 0.0f;

        M.r[1].vector4_f32[0] = pSource->m[1][0];
        M.r[1].vector4_f32[1] = pSource->m[1][1];
        M.r[1].vector4_f32[2] = pSource->m[1][2];
        M.r[1].vector4_f32[3] = 0.0f;

        M.r[2].vector4_f32[0] = pSource->m[2][0];
        M.r[2].vector4_f32[1] = pSource->m[2][1];
        M.r[2].vector4_f32[2] = pSource->m[2][2];
        M.r[2].vector4_f32[3] = 0.0f;

        M.r[3].vector4_f32[0] = pSource->m[3][0];
        M.r[3].vector4_f32[1] = pSource->m[3][1];
        M.r[3].vector4_f32[2] = pSource->m[3][2];
        M.r[3].vector4_f32[3] = 1.0f;
        return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // v0 = x1,y1,z1,x2
        float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
        // v1 = y2,z2,x3,y3
        float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
        // v2 = z3,x4,y4,z4
        float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);

        // T1 = x2,y2,z2,x3
        float32x4_t T1 = vextq_f32(v0, v1, 3);
        // T2 = x3,y3,z3,x4
        float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
        // T3 = x4,y4,z4,z3
        float32x4_t T3 = vextq_f32(v2, v2, 1);

        XMMATRIX M;
        // Masking with g_XMMask3 clears the stray lane 3 of rows 0-2.
        M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
        M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
        M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
        // Row 3 gets 1.0f in lane 3 in place of the stray z3.
        M.r[3] = vsetq_lane_f32(1.f, T3, 3);
        return M;
#elif defined(_XM_SSE_INTRINSICS_)
        // Use unaligned load instructions to
        // load the 12 floats
        // vTemp1 = x1,y1,z1,x2
        XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
        // vTemp2 = y2,z2,x3,y3
        XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
        // vTemp4 = z3,x4,y4,z4
        XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
        // vTemp3 = x3,y3,z3,z3
        XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
        // vTemp2 = y2,z2,x2,x2
        vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
        // vTemp2 = x2,y2,z2,z2
        vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
        // vTemp1 = x1,y1,z1,0
        vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
        // vTemp2 = x2,y2,z2,0
        vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
        // vTemp3 = x3,y3,z3,0
        vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
        // vTemp4i = x4,y4,z4,0
        // (a 4-byte whole-register right shift drops z3 and shifts zero into lane 3)
        __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
        // vTemp4i = x4,y4,z4,1.0f
        // (lane 3 is zero after the shift, so OR-ing in g_XMIdentityR3 sets it)
        vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
        XMMATRIX M(vTemp1,
                   vTemp2,
                   vTemp3,
                   _mm_castsi128_ps(vTemp4i));
        return M;
#endif
    }

    //------------------------------------------------------------------------------
    // Aligned variant of the 4x3 matrix load. Source element m[i][j] lands in
    // row i, lane j; lane 3 is 0 for rows 0-2 and 1.0f for row 3.
    // pSource must be non-null and 16-byte aligned (both asserted).
    // The SIMD paths issue aligned four-float loads at m[0][0], m[1][1] and
    // m[2][2]; per the lane comments below these are floats 0, 4 and 8 of the
    // source (assumes m is a contiguous float[4][3] - confirm at the type).
     inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        XMMATRIX M;
        M.r[0].vector4_f32[0] = pSource->m[0][0];
        M.r[0].vector4_f32[1] = pSource->m[0][1];
        M.r[0].vector4_f32[2] = pSource->m[0][2];
        M.r[0].vector4_f32[3] = 0.0f;

        M.r[1].vector4_f32[0] = pSource->m[1][0];
        M.r[1].vector4_f32[1] = pSource->m[1][1];
        M.r[1].vector4_f32[2] = pSource->m[1][2];
        M.r[1].vector4_f32[3] = 0.0f;

        M.r[2].vector4_f32[0] = pSource->m[2][0];
        M.r[2].vector4_f32[1] = pSource->m[2][1];
        M.r[2].vector4_f32[2] = pSource->m[2][2];
        M.r[2].vector4_f32[3] = 0.0f;

        M.r[3].vector4_f32[0] = pSource->m[3][0];
        M.r[3].vector4_f32[1] = pSource->m[3][1];
        M.r[3].vector4_f32[2] = pSource->m[3][2];
        M.r[3].vector4_f32[3] = 1.0f;
        return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // v0 = x1,y1,z1,x2 ; v1 = y2,z2,x3,y3 ; v2 = z3,x4,y4,z4
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only loads taking an alignment hint in bits.
        float32x4_t v0 = vld1q_f32_ex(&pSource->m[0][0], 128);
        float32x4_t v1 = vld1q_f32_ex(&pSource->m[1][1], 128);
        float32x4_t v2 = vld1q_f32_ex(&pSource->m[2][2], 128);
#else
        float32x4_t v0 = vld1q_f32(&pSource->m[0][0]);
        float32x4_t v1 = vld1q_f32(&pSource->m[1][1]);
        float32x4_t v2 = vld1q_f32(&pSource->m[2][2]);
#endif

        // T1 = x2,y2,z2,x3
        float32x4_t T1 = vextq_f32(v0, v1, 3);
        // T2 = x3,y3,z3,x4
        float32x4_t T2 = vcombine_f32(vget_high_f32(v1), vget_low_f32(v2));
        // T3 = x4,y4,z4,z3
        float32x4_t T3 = vextq_f32(v2, v2, 1);

        XMMATRIX M;
        // Masking with g_XMMask3 clears the stray lane 3 of rows 0-2.
        M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v0), g_XMMask3));
        M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
        M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
        // Row 3 gets 1.0f in lane 3 in place of the stray z3.
        M.r[3] = vsetq_lane_f32(1.f, T3, 3);
        return M;
#elif defined(_XM_SSE_INTRINSICS_)
        // Use aligned load instructions to
        // load the 12 floats
        // vTemp1 = x1,y1,z1,x2
        XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
        // vTemp2 = y2,z2,x3,y3
        XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
        // vTemp4 = z3,x4,y4,z4
        XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
        // vTemp3 = x3,y3,z3,z3
        XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
        // vTemp2 = y2,z2,x2,x2
        vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
        // vTemp2 = x2,y2,z2,z2
        vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
        // vTemp1 = x1,y1,z1,0
        vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
        // vTemp2 = x2,y2,z2,0
        vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
        // vTemp3 = x3,y3,z3,0
        vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
        // vTemp4i = x4,y4,z4,0
        // (a 4-byte whole-register right shift drops z3 and shifts zero into lane 3)
        __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
        // vTemp4i = x4,y4,z4,1.0f
        // (lane 3 is zero after the shift, so OR-ing in g_XMIdentityR3 sets it)
        vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
        XMMATRIX M(vTemp1,
                   vTemp2,
                   vTemp3,
                   _mm_castsi128_ps(vTemp4i));
        return M;
#endif
    }

    //------------------------------------------------------------------------------
    // Loads a 3x4 float matrix (three rows of four floats) into a 4x4 XMMATRIX,
    // transposing as it goes: source element m[j][i] lands in row i, lane j.
    // Lane 3 is 0 for rows 0-2 and 1.0f for row 3.
    // pSource must be non-null (asserted).
     inline XMMATRIX XM_CALLCONV XMLoadFloat3x4(const XMFLOAT3X4 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)

        XMMATRIX M;
        M.r[0].vector4_f32[0] = pSource->m[0][0];
        M.r[0].vector4_f32[1] = pSource->m[1][0];
        M.r[0].vector4_f32[2] = pSource->m[2][0];
        M.r[0].vector4_f32[3] = 0.0f;

        M.r[1].vector4_f32[0] = pSource->m[0][1];
        M.r[1].vector4_f32[1] = pSource->m[1][1];
        M.r[1].vector4_f32[2] = pSource->m[2][1];
        M.r[1].vector4_f32[3] = 0.0f;

        M.r[2].vector4_f32[0] = pSource->m[0][2];
        M.r[2].vector4_f32[1] = pSource->m[1][2];
        M.r[2].vector4_f32[2] = pSource->m[2][2];
        M.r[2].vector4_f32[3] = 0.0f;

        M.r[3].vector4_f32[0] = pSource->m[0][3];
        M.r[3].vector4_f32[1] = pSource->m[1][3];
        M.r[3].vector4_f32[2] = pSource->m[2][3];
        M.r[3].vector4_f32[3] = 1.0f;
        return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // De-interleaving load of the first eight floats (source rows 0 and 1):
        // val[k] = { m[0][k], m[1][k] }
        float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
        // vTemp1 = source row 2 = m20,m21,m22,m23
        float32x4_t vTemp1 = vld1q_f32(&pSource->_31);

        // l = m20,m21
        float32x2_t l = vget_low_f32(vTemp1);
        // T0 = m00,m10,m20,m21
        float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
        // rl = m21,m20
        float32x2_t rl = vrev64_f32(l);
        // T1 = m01,m11,m21,m20
        float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);

        // h = m22,m23
        float32x2_t h = vget_high_f32(vTemp1);
        // T2 = m02,m12,m22,m23
        float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
        // rh = m23,m22
        float32x2_t rh = vrev64_f32(h);
        // T3 = m03,m13,m23,m22
        float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);

        XMMATRIX M = {};
        // Masking with g_XMMask3 clears the stray lane 3 of rows 0-2.
        M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
        M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
        M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
        // Row 3 gets 1.0f in lane 3 in place of the stray m22.
        M.r[3] = vsetq_lane_f32(1.f, T3, 3);
        return M;
#elif defined(_XM_SSE_INTRINSICS_)
        // Load the three source rows, append g_XMIdentityR3 as a fourth row,
        // then perform a 4x4 transpose.
        XMMATRIX M;
        M.r[0] = _mm_loadu_ps(&pSource->_11);
        M.r[1] = _mm_loadu_ps(&pSource->_21);
        M.r[2] = _mm_loadu_ps(&pSource->_31);
        M.r[3] = g_XMIdentityR3;

        // In the lane comments below, x/y/z/w name M.r[0..3] and the suffix
        // names the lane within that row.
        // x.x,x.y,y.x,y.y
        XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
        // x.z,x.w,y.z,y.w
        XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
        // z.x,z.y,w.x,w.y
        XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
        // z.z,z.w,w.z,w.w
        XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
        XMMATRIX mResult;

        // x.x,y.x,z.x,w.x
        mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
        // x.y,y.y,z.y,w.y
        mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
        // x.z,y.z,z.z,w.z
        mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
        // x.w,y.w,z.w,w.w
        mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
        return mResult;
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant of the 3x4 load: returns the XMFLOAT3X4A transposed as an
     // XMMATRIX, with result row c receiving source column c in x/y/z. The w lane is
     // 0 for result rows 0-2 and 1 for result row 3. pSource must be non-null and
     // 16-byte aligned (both asserted); the SSE path uses aligned loads.
     inline XMMATRIX XM_CALLCONV XMLoadFloat3x4A(const XMFLOAT3X4A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: M.r[c].vector4_f32[r] = pSource->m[r][c]; the w lanes are constants.
        XMMATRIX M;
        M.r[0].vector4_f32[0] = pSource->m[0][0];
        M.r[0].vector4_f32[1] = pSource->m[1][0];
        M.r[0].vector4_f32[2] = pSource->m[2][0];
        M.r[0].vector4_f32[3] = 0.0f;

        M.r[1].vector4_f32[0] = pSource->m[0][1];
        M.r[1].vector4_f32[1] = pSource->m[1][1];
        M.r[1].vector4_f32[2] = pSource->m[2][1];
        M.r[1].vector4_f32[3] = 0.0f;

        M.r[2].vector4_f32[0] = pSource->m[0][2];
        M.r[2].vector4_f32[1] = pSource->m[1][2];
        M.r[2].vector4_f32[2] = pSource->m[2][2];
        M.r[2].vector4_f32[3] = 0.0f;

        M.r[3].vector4_f32[0] = pSource->m[0][3];
        M.r[3].vector4_f32[1] = pSource->m[1][3];
        M.r[3].vector4_f32[2] = pSource->m[2][3];
        M.r[3].vector4_f32[3] = 1.0f;
        return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Same loads in both branches; the MSVC-only _ex forms additionally pass 128
        // (presumably an alignment hint in bits -- confirm against the MSVC NEON header).
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        float32x2x4_t vTemp0 = vld4_f32_ex(&pSource->_11, 128);
        float32x4_t vTemp1 = vld1q_f32_ex(&pSource->_31, 128);
#else
        float32x2x4_t vTemp0 = vld4_f32(&pSource->_11);
        float32x4_t vTemp1 = vld1q_f32(&pSource->_31);
#endif

        // l = { _31, _32 }: T0 takes _31 in lane 2; the reversed pair puts _32 in lane 2 of T1.
        float32x2_t l = vget_low_f32(vTemp1);
        float32x4_t T0 = vcombine_f32(vTemp0.val[0], l);
        float32x2_t rl = vrev64_f32(l);
        float32x4_t T1 = vcombine_f32(vTemp0.val[1], rl);

        // h = { _33, _34 }: same trick for T2 (lane 2 = _33) and T3 (lane 2 = _34).
        float32x2_t h = vget_high_f32(vTemp1);
        float32x4_t T2 = vcombine_f32(vTemp0.val[2], h);
        float32x2_t rh = vrev64_f32(h);
        float32x4_t T3 = vcombine_f32(vTemp0.val[3], rh);

        // Lane 3 of each T holds a leftover value: mask it off with g_XMMask3 for
        // rows 0-2 and overwrite it with 1 for row 3.
        XMMATRIX M = {};
        M.r[0] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T0), g_XMMask3));
        M.r[1] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T1), g_XMMask3));
        M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
        M.r[3] = vsetq_lane_f32(1.f, T3, 3);
        return M;
#elif defined(_XM_SSE_INTRINSICS_)
        // Load the three source rows, append g_XMIdentityR3 as a stand-in fourth row
        // (presumably (0,0,0,1), which would match the scalar path), then transpose 4x4.
        XMMATRIX M;
        M.r[0] = _mm_load_ps(&pSource->_11);
        M.r[1] = _mm_load_ps(&pSource->_21);
        M.r[2] = _mm_load_ps(&pSource->_31);
        M.r[3] = g_XMIdentityR3;

        // In the lane comments below, x/y/z/w before the dot name M.r[0..3].
        // x.x,x.y,y.x,y.y
        XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
        // x.z,x.w,y.z,y.w
        XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
        // z.x,z.y,w.x,w.y
        XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
        // z.z,z.w,w.z,w.w
        XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));
        XMMATRIX mResult;

        // x.x,y.x,z.x,w.x
        mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
        // x.y,y.y,z.y,w.y
        mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
        // x.z,y.z,z.z,w.z
        mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
        // x.w,y.w,z.w,w.w
        mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
        return mResult;
#endif
    }

    //------------------------------------------------------------------------------
     // Copies all sixteen floats of an XMFLOAT4X4 into an XMMATRIX, one source row
     // per matrix row (no transpose). pSource must be non-null (asserted); no
     // alignment is asserted and the SSE path uses unaligned loads.
     inline XMMATRIX XM_CALLCONV XMLoadFloat4x4(const XMFLOAT4X4 *pSource) noexcept
    {
        assert(pSource);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: straight element-wise copy in row-major order.
        XMMATRIX result;
        for (size_t row = 0; row < 4; ++row)
        {
            for (size_t col = 0; col < 4; ++col)
            {
                result.r[row].vector4_f32[col] = pSource->m[row][col];
            }
        }
        return result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Address of the first element of each source row.
        const float *const pRow0 = reinterpret_cast<const float *>(&pSource->_11);
        const float *const pRow1 = reinterpret_cast<const float *>(&pSource->_21);
        const float *const pRow2 = reinterpret_cast<const float *>(&pSource->_31);
        const float *const pRow3 = reinterpret_cast<const float *>(&pSource->_41);

        XMMATRIX result;
        result.r[0] = vld1q_f32(pRow0);
        result.r[1] = vld1q_f32(pRow1);
        result.r[2] = vld1q_f32(pRow2);
        result.r[3] = vld1q_f32(pRow3);
        return result;
#elif defined(_XM_SSE_INTRINSICS_)
        // One unaligned 4-float load per row.
        XMMATRIX result;
        result.r[0] = _mm_loadu_ps(&pSource->_11);
        result.r[1] = _mm_loadu_ps(&pSource->_21);
        result.r[2] = _mm_loadu_ps(&pSource->_31);
        result.r[3] = _mm_loadu_ps(&pSource->_41);
        return result;
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant of the 4x4 load: copies all sixteen floats of an XMFLOAT4X4A
     // into an XMMATRIX, one source row per matrix row. pSource must be non-null
     // and 16-byte aligned (both asserted); the SSE path uses aligned loads.
     inline XMMATRIX XM_CALLCONV XMLoadFloat4x4A(const XMFLOAT4X4A *pSource) noexcept
    {
        assert(pSource);
        assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: straight element-wise copy in row-major order.
        XMMATRIX result;
        for (size_t row = 0; row < 4; ++row)
        {
            for (size_t col = 0; col < 4; ++col)
            {
                result.r[row].vector4_f32[col] = pSource->m[row][col];
            }
        }
        return result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Address of the first element of each source row.
        const float *const pRow0 = reinterpret_cast<const float *>(&pSource->_11);
        const float *const pRow1 = reinterpret_cast<const float *>(&pSource->_21);
        const float *const pRow2 = reinterpret_cast<const float *>(&pSource->_31);
        const float *const pRow3 = reinterpret_cast<const float *>(&pSource->_41);

        XMMATRIX result;
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only _ex loads take an extra 128 argument alongside the pointer.
        result.r[0] = vld1q_f32_ex(pRow0, 128);
        result.r[1] = vld1q_f32_ex(pRow1, 128);
        result.r[2] = vld1q_f32_ex(pRow2, 128);
        result.r[3] = vld1q_f32_ex(pRow3, 128);
#else
        result.r[0] = vld1q_f32(pRow0);
        result.r[1] = vld1q_f32(pRow1);
        result.r[2] = vld1q_f32(pRow2);
        result.r[3] = vld1q_f32(pRow3);
#endif
        return result;
#elif defined(_XM_SSE_INTRINSICS_)
        // One aligned 4-float load per row.
        XMMATRIX result;
        result.r[0] = _mm_load_ps(&pSource->_11);
        result.r[1] = _mm_load_ps(&pSource->_21);
        result.r[2] = _mm_load_ps(&pSource->_31);
        result.r[3] = _mm_load_ps(&pSource->_41);
        return result;
#endif
    }

    /****************************************************************************
     *
     * Vector and matrix store operations
     *
     ****************************************************************************/
     // Writes the first (X) lane of V to *pDestination as a 32-bit unsigned value;
     // the SIMD paths copy the lane's bits without conversion.
     // pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreInt(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const uint32_t xBits = XMVectorGetIntX(V);
        *pDestination = xBits;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // View V as four 32-bit unsigned lanes and store lane 0.
        const uint32x4_t *pLanes = reinterpret_cast<const uint32x4_t *>(&V);
        vst1q_lane_u32(pDestination, *pLanes, 0);
#elif defined(_XM_SSE_INTRINSICS_)
        // Scalar float store through a reinterpreted pointer copies the bits unchanged.
        float *pOut = reinterpret_cast<float *>(pDestination);
        _mm_store_ss(pOut, V);
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the first (X) lane of V to *pDestination as a float.
     // pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreFloat(
        float *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float x = XMVectorGetX(V);
        *pDestination = x;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Store lane 0 only.
        vst1q_lane_f32(pDestination, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
        // Store the lowest float only.
        _mm_store_ss(pDestination, V);
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the first two 32-bit lanes (X, Y) of V to pDestination[0..1] with no
     // numeric conversion. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreInt2(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 2; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Reinterpret as unsigned lanes, keep the low half, store it.
        const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(V));
        vst1_u32(pDestination, xy);
#elif defined(_XM_SSE_INTRINSICS_)
        // The low 64 bits of the register are X and Y; store them as one double.
        double *pOut = reinterpret_cast<double *>(pDestination);
        _mm_store_sd(pOut, _mm_castps_pd(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes the first two 32-bit lanes (X, Y) of V to
     // pDestination[0..1] with no numeric conversion. pDestination must be
     // non-null and 16-byte aligned (both asserted).
     inline void XM_CALLCONV XMStoreInt2A(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 2; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(V));
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only _ex store takes an extra 64 argument alongside the pointer.
        vst1_u32_ex(pDestination, xy, 64);
#else
        vst1_u32(pDestination, xy);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // The low 64 bits of the register are X and Y; store them as one double.
        double *pOut = reinterpret_cast<double *>(pDestination);
        _mm_store_sd(pOut, _mm_castps_pd(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the X and Y lanes of V into pDestination->x / ->y.
     // pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreFloat2(
        XMFLOAT2 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Store the low half of the vector (two floats).
        float *pOut = reinterpret_cast<float *>(pDestination);
        vst1_f32(pOut, vget_low_f32(V));
#elif defined(_XM_SSE_INTRINSICS_)
        // The low 64 bits of the register are X and Y; store them as one double.
        double *pOut = reinterpret_cast<double *>(pDestination);
        _mm_store_sd(pOut, _mm_castps_pd(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes the X and Y lanes of V into pDestination->x / ->y.
     // pDestination must be non-null and 16-byte aligned (both asserted).
     inline void XM_CALLCONV XMStoreFloat2A(
        XMFLOAT2A *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        float *pOut = reinterpret_cast<float *>(pDestination);
        const float32x2_t xy = vget_low_f32(V);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only _ex store takes an extra 64 argument alongside the pointer.
        vst1_f32_ex(pOut, xy, 64);
#else
        vst1_f32(pOut, xy);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // The low 64 bits of the register are X and Y; store them as one double.
        double *pOut = reinterpret_cast<double *>(pDestination);
        _mm_store_sd(pOut, _mm_castps_pd(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Converts the X and Y lanes of V from float to signed 32-bit integers and
     // writes them to pDestination->x / ->y. In the SSE path, lanes greater than
     // g_XMMaxInt are stored as 0x7FFFFFFF. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreSInt2(
        XMINT2 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<int32_t>(lanes[0]);
        pDestination->y = static_cast<int32_t>(lanes[1]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert only the low half (X, Y), then store both ints.
        int32_t *pOut = reinterpret_cast<int32_t *>(pDestination);
        const int32x2_t xy = vcvt_s32_f32(vget_low_f32(V));
        vst1_s32(pOut, xy);
#elif defined(_XM_SSE_INTRINSICS_)
        // Mask of lanes that exceed the largest representable signed int
        const XMVECTOR overflowMask = _mm_cmpgt_ps(V, g_XMMaxInt);
        // Truncating float -> int conversion of every lane
        const XMVECTOR converted = _mm_castsi128_ps(_mm_cvttps_epi32(V));
        // 0x7FFFFFFF in the overflowed lanes, zero elsewhere
        const XMVECTOR saturated = _mm_and_ps(overflowMask, g_XMAbsMask);
        // Converted value in the lanes that did not overflow, zero elsewhere
        const XMVECTOR inRange = _mm_andnot_ps(overflowMask, converted);
        const XMVECTOR merged = _mm_or_ps(inRange, saturated);
        // X and Y occupy the low 64 bits; write them with one store
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(merged));
#endif
    }

    //------------------------------------------------------------------------------
     // Converts the X and Y lanes of V from float to unsigned 32-bit integers and
     // writes them to pDestination->x / ->y. In the SSE path, negative lanes are
     // clamped to zero first and lanes greater than g_XMMaxUInt are stored as
     // 0xFFFFFFFF. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreUInt2(
        XMUINT2 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<uint32_t>(lanes[0]);
        pDestination->y = static_cast<uint32_t>(lanes[1]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert only the low half (X, Y), then store both uints.
        uint32_t *pOut = reinterpret_cast<uint32_t *>(pDestination);
        const uint32x2_t xy = vcvt_u32_f32(vget_low_f32(V));
        vst1_u32(pOut, xy);
#elif defined(_XM_SSE_INTRINSICS_)
        // Clamp to >= 0
        const XMVECTOR clamped = _mm_max_ps(V, g_XMZero);
        // Lanes that are too big end up as 0xFFFFFFFF
        const XMVECTOR tooBig = _mm_cmpgt_ps(clamped, g_XMMaxUInt);
        const XMVECTOR signedLimit = g_XMUnsignedFix;
        // Lanes too large for the signed conversion
        const XMVECTOR needsFix = _mm_cmpge_ps(clamped, signedLimit);
        // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
        const XMVECTOR bias = _mm_and_ps(signedLimit, needsFix);
        // Subtract the bias only where needed (keeps low bit precision)
        const XMVECTOR rebased = _mm_sub_ps(clamped, bias);
        const __m128i asSigned = _mm_cvttps_epi32(rebased);
        // Flip the top bit back on only in the lanes that were biased
        const XMVECTOR topBit = _mm_and_ps(needsFix, g_XMNegativeZero);
        XMVECTOR packed = _mm_xor_ps(_mm_castsi128_ps(asSigned), topBit);
        // Force the too-large lanes to 0xFFFFFFFF
        packed = _mm_or_ps(packed, tooBig);
        // X and Y occupy the low 64 bits; write them with one store
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(packed));
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the first three 32-bit lanes (X, Y, Z) of V to pDestination[0..2]
     // with no numeric conversion. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreInt3(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 3; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half (X, Y) in one store, then lane 2 (Z) on its own.
        const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(V));
        vst1_u32(pDestination, xy);
        const uint32x4_t *pLanes = reinterpret_cast<const uint32x4_t *>(&V);
        vst1q_lane_u32(pDestination + 2, *pLanes, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // X and Y go out together as the low 64 bits.
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(V));
        // Broadcast Z to every lane so the scalar store picks it up.
        const __m128 zSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        float *pZ = reinterpret_cast<float *>(&pDestination[2]);
        _mm_store_ss(pZ, zSplat);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes the first three 32-bit lanes (X, Y, Z) of V to
     // pDestination[0..2] with no numeric conversion. pDestination must be
     // non-null and 16-byte aligned (both asserted).
     inline void XM_CALLCONV XMStoreInt3A(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 3; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half (X, Y) in one store, then lane 2 (Z) on its own.
        const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(V));
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        vst1_u32_ex(pDestination, xy, 64);
#else
        vst1_u32(pDestination, xy);
#endif
        const uint32x4_t *pLanes = reinterpret_cast<const uint32x4_t *>(&V);
        vst1q_lane_u32(pDestination + 2, *pLanes, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // X and Y go out together as the low 64 bits.
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(V));
        // Move the high pair (Z, W) down so Z sits in the lowest lane.
        const __m128 zInLow = _mm_movehl_ps(V, V);
        float *pZ = reinterpret_cast<float *>(&pDestination[2]);
        _mm_store_ss(pZ, zInLow);
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the X, Y and Z lanes of V into pDestination->x / ->y / ->z.
     // pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreFloat3(
        XMFLOAT3 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
        pDestination->z = lanes[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half (X, Y) in one store, then lane 2 (Z) on its own.
        float *pOut = reinterpret_cast<float *>(pDestination);
        vst1_f32(pOut, vget_low_f32(V));
        vst1q_lane_f32(pOut + 2, V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
        // _mm_extract_ps yields each lane's bits as an int; write them through
        // int-typed views of the three float members.
        int *pX = reinterpret_cast<int *>(&pDestination->x);
        int *pY = reinterpret_cast<int *>(&pDestination->y);
        int *pZ = reinterpret_cast<int *>(&pDestination->z);
        *pX = _mm_extract_ps(V, 0);
        *pY = _mm_extract_ps(V, 1);
        *pZ = _mm_extract_ps(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // X and Y go out together as the low 64 bits.
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(V));
        // Broadcast Z to every lane so the scalar store picks it up.
        const __m128 zSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        _mm_store_ss(&pDestination->z, zSplat);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes the X, Y and Z lanes of V into pDestination->x /
     // ->y / ->z. pDestination must be non-null and 16-byte aligned (both asserted).
     inline void XM_CALLCONV XMStoreFloat3A(
        XMFLOAT3A *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
        pDestination->z = lanes[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Low half (X, Y) in one store, then lane 2 (Z) on its own.
        float *pOut = reinterpret_cast<float *>(pDestination);
        const float32x2_t xy = vget_low_f32(V);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        vst1_f32_ex(pOut, xy, 64);
#else
        vst1_f32(pOut, xy);
#endif
        vst1q_lane_f32(pOut + 2, V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
        // X and Y go out together as the low 64 bits; Z is extracted as raw bits
        // and written through an int-typed view of the float member.
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(V));
        int *pZ = reinterpret_cast<int *>(&pDestination->z);
        *pZ = _mm_extract_ps(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(V));
        // Move the high pair (Z, W) down so Z sits in the lowest lane.
        const __m128 zInLow = _mm_movehl_ps(V, V);
        _mm_store_ss(&pDestination->z, zInLow);
#endif
    }

    //------------------------------------------------------------------------------
     // Converts the X, Y and Z lanes of V from float to signed 32-bit integers and
     // writes them to pDestination->x / ->y / ->z. In the SSE path, lanes greater
     // than g_XMMaxInt are stored as 0x7FFFFFFF. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreSInt3(
        XMINT3 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<int32_t>(lanes[0]);
        pDestination->y = static_cast<int32_t>(lanes[1]);
        pDestination->z = static_cast<int32_t>(lanes[2]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert all four lanes, then store X/Y as a pair and Z by lane.
        int32_t *pOut = reinterpret_cast<int32_t *>(pDestination);
        const int32x4_t converted = vcvtq_s32_f32(V);
        vst1_s32(pOut, vget_low_s32(converted));
        vst1q_lane_s32(pOut + 2, converted, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // Mask of lanes that exceed the largest representable signed int
        const XMVECTOR overflowMask = _mm_cmpgt_ps(V, g_XMMaxInt);
        // Truncating float -> int conversion of every lane
        const XMVECTOR converted = _mm_castsi128_ps(_mm_cvttps_epi32(V));
        // 0x7FFFFFFF in the overflowed lanes, zero elsewhere
        const XMVECTOR saturated = _mm_and_ps(overflowMask, g_XMAbsMask);
        // Converted value in the lanes that did not overflow, zero elsewhere
        const XMVECTOR inRange = _mm_andnot_ps(overflowMask, converted);
        const XMVECTOR merged = _mm_or_ps(inRange, saturated);
        // Write 3 ints: X/Y as the low 64 bits, then Z via a broadcast + scalar store
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(merged));
        const __m128 zSplat = XM_PERMUTE_PS(merged, _MM_SHUFFLE(2, 2, 2, 2));
        _mm_store_ss(reinterpret_cast<float *>(&pDestination->z), zSplat);
#endif
    }

    //------------------------------------------------------------------------------
     // Converts the X, Y and Z lanes of V from float to unsigned 32-bit integers
     // and writes them to pDestination->x / ->y / ->z. In the SSE path, negative
     // lanes are clamped to zero first and lanes greater than g_XMMaxUInt are
     // stored as 0xFFFFFFFF. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreUInt3(
        XMUINT3 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<uint32_t>(lanes[0]);
        pDestination->y = static_cast<uint32_t>(lanes[1]);
        pDestination->z = static_cast<uint32_t>(lanes[2]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Convert all four lanes, then store X/Y as a pair and Z by lane.
        uint32_t *pOut = reinterpret_cast<uint32_t *>(pDestination);
        const uint32x4_t converted = vcvtq_u32_f32(V);
        vst1_u32(pOut, vget_low_u32(converted));
        vst1q_lane_u32(pOut + 2, converted, 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // Clamp to >= 0
        const XMVECTOR clamped = _mm_max_ps(V, g_XMZero);
        // Lanes that are too big end up as 0xFFFFFFFF
        const XMVECTOR tooBig = _mm_cmpgt_ps(clamped, g_XMMaxUInt);
        const XMVECTOR signedLimit = g_XMUnsignedFix;
        // Lanes too large for the signed conversion
        const XMVECTOR needsFix = _mm_cmpge_ps(clamped, signedLimit);
        // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
        const XMVECTOR bias = _mm_and_ps(signedLimit, needsFix);
        // Subtract the bias only where needed (keeps low bit precision)
        const XMVECTOR rebased = _mm_sub_ps(clamped, bias);
        const __m128i asSigned = _mm_cvttps_epi32(rebased);
        // Flip the top bit back on only in the lanes that were biased
        const XMVECTOR topBit = _mm_and_ps(needsFix, g_XMNegativeZero);
        XMVECTOR packed = _mm_xor_ps(_mm_castsi128_ps(asSigned), topBit);
        // Force the too-large lanes to 0xFFFFFFFF
        packed = _mm_or_ps(packed, tooBig);
        // Write 3 uints: X/Y as the low 64 bits, then Z via a broadcast + scalar store
        _mm_store_sd(reinterpret_cast<double *>(pDestination), _mm_castps_pd(packed));
        const __m128 zSplat = XM_PERMUTE_PS(packed, _MM_SHUFFLE(2, 2, 2, 2));
        _mm_store_ss(reinterpret_cast<float *>(&pDestination->z), zSplat);
#endif
    }

    //------------------------------------------------------------------------------
     // Writes all four 32-bit lanes of V to pDestination[0..3] with no numeric
     // conversion. pDestination must be non-null (asserted); the SSE path uses an
     // unaligned store.
     inline void XM_CALLCONV XMStoreInt4(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 4; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        const uint32x4_t bits = vreinterpretq_u32_f32(V);
        vst1q_u32(pDestination, bits);
#elif defined(_XM_SSE_INTRINSICS_)
        __m128i *pOut = reinterpret_cast<__m128i *>(pDestination);
        _mm_storeu_si128(pOut, _mm_castps_si128(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes all four 32-bit lanes of V to pDestination[0..3]
     // with no numeric conversion. pDestination must be non-null and 16-byte
     // aligned (both asserted); the SSE path uses an aligned store.
     inline void XM_CALLCONV XMStoreInt4A(
        uint32_t *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        for (size_t lane = 0; lane < 4; ++lane)
        {
            pDestination[lane] = V.vector4_u32[lane];
        }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // V is passed without a reinterpret in this branch, which is only compiled
        // when _ARM64_DISTINCT_NEON_TYPES is not defined.
        vst1q_u32_ex(pDestination, V, 128);
#else
        const uint32x4_t bits = vreinterpretq_u32_f32(V);
        vst1q_u32(pDestination, bits);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        __m128i *pOut = reinterpret_cast<__m128i *>(pDestination);
        _mm_store_si128(pOut, _mm_castps_si128(V));
#endif
    }

    //------------------------------------------------------------------------------
     // Writes all four lanes of V into pDestination->x / ->y / ->z / ->w.
     // pDestination must be non-null (asserted); the SSE path uses an unaligned store.
     inline void XM_CALLCONV XMStoreFloat4(
        XMFLOAT4 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
        pDestination->z = lanes[2];
        pDestination->w = lanes[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        float *pOut = reinterpret_cast<float *>(pDestination);
        vst1q_f32(pOut, V);
#elif defined(_XM_SSE_INTRINSICS_)
        float *pOut = &pDestination->x;
        _mm_storeu_ps(pOut, V);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant: writes all four lanes of V into pDestination->x / ->y /
     // ->z / ->w. pDestination must be non-null and 16-byte aligned (both
     // asserted); the SSE path uses an aligned store.
     inline void XM_CALLCONV XMStoreFloat4A(
        XMFLOAT4A *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = lanes[0];
        pDestination->y = lanes[1];
        pDestination->z = lanes[2];
        pDestination->w = lanes[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        float *pOut = reinterpret_cast<float *>(pDestination);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC-only _ex store takes an extra 128 argument alongside the pointer.
        vst1q_f32_ex(pOut, V, 128);
#else
        vst1q_f32(pOut, V);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        float *pOut = &pDestination->x;
        _mm_store_ps(pOut, V);
#endif
    }

    //------------------------------------------------------------------------------
     // Converts all four lanes of V from float to signed 32-bit integers and
     // writes them to pDestination->x / ->y / ->z / ->w. In the SSE path, lanes
     // greater than g_XMMaxInt are stored as 0x7FFFFFFF and the store is
     // unaligned. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreSInt4(
        XMINT4 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<int32_t>(lanes[0]);
        pDestination->y = static_cast<int32_t>(lanes[1]);
        pDestination->z = static_cast<int32_t>(lanes[2]);
        pDestination->w = static_cast<int32_t>(lanes[3]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        int32_t *pOut = reinterpret_cast<int32_t *>(pDestination);
        vst1q_s32(pOut, vcvtq_s32_f32(V));
#elif defined(_XM_SSE_INTRINSICS_)
        // Mask of lanes that exceed the largest representable signed int
        const XMVECTOR overflowMask = _mm_cmpgt_ps(V, g_XMMaxInt);
        // Truncating float -> int conversion of every lane
        const XMVECTOR converted = _mm_castsi128_ps(_mm_cvttps_epi32(V));
        // 0x7FFFFFFF in the overflowed lanes, zero elsewhere
        const XMVECTOR saturated = _mm_and_ps(overflowMask, g_XMAbsMask);
        // Converted value in the lanes that did not overflow, zero elsewhere
        const XMVECTOR inRange = _mm_andnot_ps(overflowMask, converted);
        const XMVECTOR merged = _mm_or_ps(inRange, saturated);
        __m128i *pOut = reinterpret_cast<__m128i *>(pDestination);
        _mm_storeu_si128(pOut, _mm_castps_si128(merged));
#endif
    }

    //------------------------------------------------------------------------------
     // Converts all four lanes of V from float to unsigned 32-bit integers and
     // writes them to pDestination->x / ->y / ->z / ->w. In the SSE path, negative
     // lanes are clamped to zero first, lanes greater than g_XMMaxUInt are stored
     // as 0xFFFFFFFF, and the store is unaligned. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreUInt4(
        XMUINT4 *pDestination,
        FXMVECTOR V) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
        const float *lanes = V.vector4_f32;
        pDestination->x = static_cast<uint32_t>(lanes[0]);
        pDestination->y = static_cast<uint32_t>(lanes[1]);
        pDestination->z = static_cast<uint32_t>(lanes[2]);
        pDestination->w = static_cast<uint32_t>(lanes[3]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
        uint32_t *pOut = reinterpret_cast<uint32_t *>(pDestination);
        vst1q_u32(pOut, vcvtq_u32_f32(V));
#elif defined(_XM_SSE_INTRINSICS_)
        // Clamp to >= 0
        const XMVECTOR clamped = _mm_max_ps(V, g_XMZero);
        // Lanes that are too big end up as 0xFFFFFFFF
        const XMVECTOR tooBig = _mm_cmpgt_ps(clamped, g_XMMaxUInt);
        const XMVECTOR signedLimit = g_XMUnsignedFix;
        // Lanes too large for the signed conversion
        const XMVECTOR needsFix = _mm_cmpge_ps(clamped, signedLimit);
        // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
        const XMVECTOR bias = _mm_and_ps(signedLimit, needsFix);
        // Subtract the bias only where needed (keeps low bit precision)
        const XMVECTOR rebased = _mm_sub_ps(clamped, bias);
        const __m128i asSigned = _mm_cvttps_epi32(rebased);
        // Flip the top bit back on only in the lanes that were biased
        const XMVECTOR topBit = _mm_and_ps(needsFix, g_XMNegativeZero);
        XMVECTOR packed = _mm_xor_ps(_mm_castsi128_ps(asSigned), topBit);
        // Force the too-large lanes to 0xFFFFFFFF
        packed = _mm_or_ps(packed, tooBig);
        __m128i *pOut = reinterpret_cast<__m128i *>(pDestination);
        _mm_storeu_si128(pOut, _mm_castps_si128(packed));
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the x/y/z lanes of rows 0-2 of M into the nine floats of
     // *pDestination (m[row][col] = M.r[row] lane col). Row 3 and every w lane
     // are not stored. pDestination must be non-null (asserted).
     inline void XM_CALLCONV XMStoreFloat3x3(
        XMFLOAT3X3 *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: nine individual element copies.
        pDestination->m[0][0] = M.r[0].vector4_f32[0];
        pDestination->m[0][1] = M.r[0].vector4_f32[1];
        pDestination->m[0][2] = M.r[0].vector4_f32[2];

        pDestination->m[1][0] = M.r[1].vector4_f32[0];
        pDestination->m[1][1] = M.r[1].vector4_f32[1];
        pDestination->m[1][2] = M.r[1].vector4_f32[2];

        pDestination->m[2][0] = M.r[2].vector4_f32[0];
        pDestination->m[2][1] = M.r[2].vector4_f32[1];
        pDestination->m[2][2] = M.r[2].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // The nine floats are written as 4 + 4 + 1, starting at m[0][0], m[1][1] and m[2][2].
        // T1 = { r0.y, r0.z, r0.w, r1.x }; selecting r0 through g_XMMask3 gives
        // T2 = { r0.x, r0.y, r0.z, r1.x }.
        float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
        float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
        vst1q_f32(&pDestination->m[0][0], T2);

        // T1 = r1 rotated by one lane; T2 = { r1.y, r1.z, r2.x, r2.y }.
        T1 = vextq_f32(M.r[1], M.r[1], 1);
        T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
        vst1q_f32(&pDestination->m[1][1], T2);

        // Final element: r2.z only.
        vst1q_lane_f32(&pDestination->m[2][2], M.r[2], 2);
#elif defined(_XM_SSE_INTRINSICS_)
        // The nine floats are written as 4 + 4 + 1, starting at m[0][0], m[1][1] and m[2][2].
        XMVECTOR vTemp1 = M.r[0];
        XMVECTOR vTemp2 = M.r[1];
        XMVECTOR vTemp3 = M.r[2];
        // vWork = { r0.z, r0.z, r1.x, r1.x }
        XMVECTOR vWork = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 0, 2, 2));
        // vTemp1 = { r0.x, r0.y, r0.z, r1.x }
        vTemp1 = _mm_shuffle_ps(vTemp1, vWork, _MM_SHUFFLE(2, 0, 1, 0));
        _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
        // vTemp2 = { r1.y, r1.z, r2.x, r2.y }
        vTemp2 = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
        _mm_storeu_ps(&pDestination->m[1][1], vTemp2);
        // Broadcast r2.z so the scalar store writes it
        vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(2, 2, 2, 2));
        _mm_store_ss(&pDestination->m[2][2], vTemp3);
#endif
    }

    //------------------------------------------------------------------------------
     // Writes the x/y/z lanes of all four rows of M into the twelve floats of
     // *pDestination (m[row][col] = M.r[row] lane col). The w lanes are not
     // stored. pDestination must be non-null (asserted); the SSE path uses
     // unaligned stores.
     inline void XM_CALLCONV XMStoreFloat4x3(
        XMFLOAT4X3 *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: twelve individual element copies.
        pDestination->m[0][0] = M.r[0].vector4_f32[0];
        pDestination->m[0][1] = M.r[0].vector4_f32[1];
        pDestination->m[0][2] = M.r[0].vector4_f32[2];

        pDestination->m[1][0] = M.r[1].vector4_f32[0];
        pDestination->m[1][1] = M.r[1].vector4_f32[1];
        pDestination->m[1][2] = M.r[1].vector4_f32[2];

        pDestination->m[2][0] = M.r[2].vector4_f32[0];
        pDestination->m[2][1] = M.r[2].vector4_f32[1];
        pDestination->m[2][2] = M.r[2].vector4_f32[2];

        pDestination->m[3][0] = M.r[3].vector4_f32[0];
        pDestination->m[3][1] = M.r[3].vector4_f32[1];
        pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // The twelve floats are written as three 4-float stores, starting at
        // m[0][0], m[1][1] and m[2][2].
        // T1 = { r0.y, r0.z, r0.w, r1.x }; selecting r0 through g_XMMask3 gives
        // T2 = { r0.x, r0.y, r0.z, r1.x }.
        float32x4_t T1 = vextq_f32(M.r[0], M.r[1], 1);
        float32x4_t T2 = vbslq_f32(g_XMMask3, M.r[0], T1);
        vst1q_f32(&pDestination->m[0][0], T2);

        // T1 = r1 rotated by one lane; T2 = { r1.y, r1.z, r2.x, r2.y }.
        T1 = vextq_f32(M.r[1], M.r[1], 1);
        T2 = vcombine_f32(vget_low_f32(T1), vget_low_f32(M.r[2]));
        vst1q_f32(&pDestination->m[1][1], T2);

        // T1 = r2.z in every lane; T2 = { r2.z, r3.x, r3.y, r3.z }.
        T1 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
        T2 = vextq_f32(T1, M.r[3], 3);
        vst1q_f32(&pDestination->m[2][2], T2);
#elif defined(_XM_SSE_INTRINSICS_)
        // The twelve floats are written as three 4-float stores, starting at
        // m[0][0], m[1][1] and m[2][2]. vTemp2 and vTemp3 are overwritten in
        // place below, so the statement order matters.
        XMVECTOR vTemp1 = M.r[0];
        XMVECTOR vTemp2 = M.r[1];
        XMVECTOR vTemp3 = M.r[2];
        XMVECTOR vTemp4 = M.r[3];
        // vTemp2x = { r1.y, r1.z, r2.x, r2.y } (taken before vTemp2 is overwritten)
        XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2, vTemp3, _MM_SHUFFLE(1, 0, 2, 1));
        // vTemp2 = { r1.x, r1.x, r0.z, r0.z }
        vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(2, 2, 0, 0));
        // vTemp1 = { r0.x, r0.y, r0.z, r1.x }
        vTemp1 = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(0, 2, 1, 0));
        // vTemp3 = { r2.z, r2.z, r3.x, r3.x }
        vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(0, 0, 2, 2));
        // vTemp3 = { r2.z, r3.x, r3.y, r3.z }
        vTemp3 = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 1, 2, 0));
        _mm_storeu_ps(&pDestination->m[0][0], vTemp1);
        _mm_storeu_ps(&pDestination->m[1][1], vTemp2x);
        _mm_storeu_ps(&pDestination->m[2][2], vTemp3);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant of the 4x3 store: writes the x, y and z lanes of all four
     // rows of M to pDestination->m. pDestination must be 16-byte aligned
     // (asserted); the SSE path uses aligned stores at m[0][0], m[1][1], m[2][2].
     inline void XM_CALLCONV XMStoreFloat4x3A(
        XMFLOAT4X3A *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: copy lanes 0..2 of each row, row by row.
        for (size_t row = 0; row < 4; ++row)
        {
            for (size_t col = 0; col < 3; ++col)
            {
                pDestination->m[row][col] = M.r[row].vector4_f32[col];
            }
        }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC NEON flavour: the _ex stores take an extra 128 (bit) alignment argument.
        // x1,y1,z1,x2
        const float32x4_t shifted0 = vextq_f32(M.r[0], M.r[1], 1);
        const float32x4_t packed0 = vbslq_f32(g_XMMask3, M.r[0], shifted0);
        vst1q_f32_ex(&pDestination->m[0][0], packed0, 128);

        // y2,z2,x3,y3
        const float32x4_t rotated1 = vextq_f32(M.r[1], M.r[1], 1);
        const float32x4_t packed1 = vcombine_f32(vget_low_f32(rotated1), vget_low_f32(M.r[2]));
        vst1q_f32_ex(&pDestination->m[1][1], packed1, 128);

        // z3,x4,y4,z4
        const float32x4_t splatZ2 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
        const float32x4_t packed2 = vextq_f32(splatZ2, M.r[3], 3);
        vst1q_f32_ex(&pDestination->m[2][2], packed2, 128);
#else
        // x1,y1,z1,x2
        const float32x4_t shifted0 = vextq_f32(M.r[0], M.r[1], 1);
        const float32x4_t packed0 = vbslq_f32(g_XMMask3, M.r[0], shifted0);
        vst1q_f32(&pDestination->m[0][0], packed0);

        // y2,z2,x3,y3
        const float32x4_t rotated1 = vextq_f32(M.r[1], M.r[1], 1);
        const float32x4_t packed1 = vcombine_f32(vget_low_f32(rotated1), vget_low_f32(M.r[2]));
        vst1q_f32(&pDestination->m[1][1], packed1);

        // z3,x4,y4,z4
        const float32x4_t splatZ2 = vdupq_lane_f32(vget_high_f32(M.r[2]), 0);
        const float32x4_t packed2 = vextq_f32(splatZ2, M.r[3], 3);
        vst1q_f32(&pDestination->m[2][2], packed2);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        const XMVECTOR row0 = M.r[0];   // x1,y1,z1,w1
        const XMVECTOR row1 = M.r[1];   // x2,y2,z2,w2
        const XMVECTOR row2 = M.r[2];   // x3,y3,z3,w3
        const XMVECTOR row3 = M.r[3];   // x4,y4,z4,w4
        // z1,z1,x2,y2
        const XMVECTOR z1x2 = _mm_shuffle_ps(row0, row1, _MM_SHUFFLE(1, 0, 2, 2));
        // y2,z2,x3,y3 (final middle vector)
        const XMVECTOR packed1 = _mm_shuffle_ps(row1, row2, _MM_SHUFFLE(1, 0, 2, 1));
        // x1,y1,z1,x2 (final first vector)
        const XMVECTOR packed0 = _mm_shuffle_ps(row0, z1x2, _MM_SHUFFLE(2, 0, 1, 0));
        // z3,z3,x4,x4
        const XMVECTOR z3x4 = _mm_shuffle_ps(row2, row3, _MM_SHUFFLE(0, 0, 2, 2));
        // z3,x4,y4,z4 (final last vector)
        const XMVECTOR packed2 = _mm_shuffle_ps(z3x4, row3, _MM_SHUFFLE(2, 1, 2, 0));
        // Three aligned 16-byte stores cover all twelve floats.
        _mm_store_ps(&pDestination->m[0][0], packed0);
        _mm_store_ps(&pDestination->m[1][1], packed1);
        _mm_store_ps(&pDestination->m[2][2], packed2);
#endif
    }

    //------------------------------------------------------------------------------
     // Stores M transposed into a 3x4 destination: pDestination->m[i][j] receives
     // lane i of M.r[j] for i in 0..2 and j in 0..3 (the w lanes of M are dropped).
     // The SIMD paths use stores with no alignment requirement.
     inline void XM_CALLCONV XMStoreFloat3x4(
        XMFLOAT3X4 *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: destination row = source lane, destination column = source row.
        for (size_t lane = 0; lane < 3; ++lane)
        {
            for (size_t row = 0; row < 4; ++row)
            {
                pDestination->m[lane][row] = M.r[row].vector4_f32[lane];
            }
        }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Two rounds of zips perform the transpose.
        const float32x4x2_t rows02 = vzipq_f32(M.r[0], M.r[2]);
        const float32x4x2_t rows13 = vzipq_f32(M.r[1], M.r[3]);

        // lanesXY.val[0] = all x lanes, lanesXY.val[1] = all y lanes
        const float32x4x2_t lanesXY = vzipq_f32(rows02.val[0], rows13.val[0]);
        // lanesZW.val[0] = all z lanes (val[1], the w lanes, is not stored)
        const float32x4x2_t lanesZW = vzipq_f32(rows02.val[1], rows13.val[1]);

        vst1q_f32(&pDestination->m[0][0], lanesXY.val[0]);
        vst1q_f32(&pDestination->m[1][0], lanesXY.val[1]);
        vst1q_f32(&pDestination->m[2][0], lanesZW.val[0]);
#elif defined(_XM_SSE_INTRINSICS_)
        // Below, rN.c denotes lane c of M.r[N].
        // r0.x,r0.y,r1.x,r1.y
        const XMVECTOR xy01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
        // r0.z,r0.w,r1.z,r1.w
        const XMVECTOR zw01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
        // r2.x,r2.y,r3.x,r3.y
        const XMVECTOR xy23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
        // r2.z,r2.w,r3.z,r3.w
        const XMVECTOR zw23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));

        // r0.x,r1.x,r2.x,r3.x
        const XMVECTOR allX = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(2, 0, 2, 0));
        // r0.y,r1.y,r2.y,r3.y
        const XMVECTOR allY = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(3, 1, 3, 1));
        // r0.z,r1.z,r2.z,r3.z
        const XMVECTOR allZ = _mm_shuffle_ps(zw01, zw23, _MM_SHUFFLE(2, 0, 2, 0));

        _mm_storeu_ps(&pDestination->m[0][0], allX);
        _mm_storeu_ps(&pDestination->m[1][0], allY);
        _mm_storeu_ps(&pDestination->m[2][0], allZ);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant of the transposed 3x4 store: pDestination->m[i][j] receives
     // lane i of M.r[j] for i in 0..2 and j in 0..3. pDestination must be 16-byte
     // aligned (asserted); the SSE path uses aligned stores.
     inline void XM_CALLCONV XMStoreFloat3x4A(
        XMFLOAT3X4A *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: destination row = source lane, destination column = source row.
        for (size_t lane = 0; lane < 3; ++lane)
        {
            for (size_t row = 0; row < 4; ++row)
            {
                pDestination->m[lane][row] = M.r[row].vector4_f32[lane];
            }
        }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // Two rounds of zips perform the transpose.
        const float32x4x2_t rows02 = vzipq_f32(M.r[0], M.r[2]);
        const float32x4x2_t rows13 = vzipq_f32(M.r[1], M.r[3]);

        // lanesXY.val[0] = all x lanes, lanesXY.val[1] = all y lanes
        const float32x4x2_t lanesXY = vzipq_f32(rows02.val[0], rows13.val[0]);
        // lanesZW.val[0] = all z lanes (val[1], the w lanes, is not stored)
        const float32x4x2_t lanesZW = vzipq_f32(rows02.val[1], rows13.val[1]);

#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC NEON flavour: the _ex stores take an extra 128 (bit) alignment argument.
        vst1q_f32_ex(&pDestination->m[0][0], lanesXY.val[0], 128);
        vst1q_f32_ex(&pDestination->m[1][0], lanesXY.val[1], 128);
        vst1q_f32_ex(&pDestination->m[2][0], lanesZW.val[0], 128);
#else
        vst1q_f32(&pDestination->m[0][0], lanesXY.val[0]);
        vst1q_f32(&pDestination->m[1][0], lanesXY.val[1]);
        vst1q_f32(&pDestination->m[2][0], lanesZW.val[0]);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // Below, rN.c denotes lane c of M.r[N].
        // r0.x,r0.y,r1.x,r1.y
        const XMVECTOR xy01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
        // r0.z,r0.w,r1.z,r1.w
        const XMVECTOR zw01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
        // r2.x,r2.y,r3.x,r3.y
        const XMVECTOR xy23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
        // r2.z,r2.w,r3.z,r3.w
        const XMVECTOR zw23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));

        // r0.x,r1.x,r2.x,r3.x
        const XMVECTOR allX = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(2, 0, 2, 0));
        // r0.y,r1.y,r2.y,r3.y
        const XMVECTOR allY = _mm_shuffle_ps(xy01, xy23, _MM_SHUFFLE(3, 1, 3, 1));
        // r0.z,r1.z,r2.z,r3.z
        const XMVECTOR allZ = _mm_shuffle_ps(zw01, zw23, _MM_SHUFFLE(2, 0, 2, 0));

        _mm_store_ps(&pDestination->m[0][0], allX);
        _mm_store_ps(&pDestination->m[1][0], allY);
        _mm_store_ps(&pDestination->m[2][0], allZ);
#endif
    }

    //------------------------------------------------------------------------------
     // Stores all sixteen lanes of M to *pDestination, one matrix row per
     // destination row. The SIMD paths use stores with no alignment requirement.
     inline void XM_CALLCONV XMStoreFloat4x4(
        XMFLOAT4X4 *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: element-wise copy, row by row.
        for (size_t row = 0; row < 4; ++row)
        {
            for (size_t col = 0; col < 4; ++col)
            {
                pDestination->m[row][col] = M.r[row].vector4_f32[col];
            }
        }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
        // One 4-float store per row, starting at the row's first element.
        float *const row0 = reinterpret_cast<float *>(&pDestination->_11);
        vst1q_f32(row0, M.r[0]);
        float *const row1 = reinterpret_cast<float *>(&pDestination->_21);
        vst1q_f32(row1, M.r[1]);
        float *const row2 = reinterpret_cast<float *>(&pDestination->_31);
        vst1q_f32(row2, M.r[2]);
        float *const row3 = reinterpret_cast<float *>(&pDestination->_41);
        vst1q_f32(row3, M.r[3]);
#elif defined(_XM_SSE_INTRINSICS_)
        // One unaligned 4-float store per row.
        _mm_storeu_ps(&pDestination->_11, M.r[0]);
        _mm_storeu_ps(&pDestination->_21, M.r[1]);
        _mm_storeu_ps(&pDestination->_31, M.r[2]);
        _mm_storeu_ps(&pDestination->_41, M.r[3]);
#endif
    }

    //------------------------------------------------------------------------------
     // Aligned variant of the 4x4 store: writes all sixteen lanes of M to
     // *pDestination, one matrix row per destination row. pDestination must be
     // 16-byte aligned (asserted); the SSE path uses aligned stores.
     inline void XM_CALLCONV XMStoreFloat4x4A(
        XMFLOAT4X4A *pDestination,
        FXMMATRIX M) noexcept
    {
        assert(pDestination);
        assert((reinterpret_cast<uintptr_t>(pDestination) & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

        // Scalar path: element-wise copy, row by row.
        for (size_t row = 0; row < 4; ++row)
        {
            for (size_t col = 0; col < 4; ++col)
            {
                pDestination->m[row][col] = M.r[row].vector4_f32[col];
            }
        }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
        // MSVC NEON flavour: the _ex stores take an extra 128 (bit) alignment argument.
        vst1q_f32_ex(reinterpret_cast<float *>(&pDestination->_11), M.r[0], 128);
        vst1q_f32_ex(reinterpret_cast<float *>(&pDestination->_21), M.r[1], 128);
        vst1q_f32_ex(reinterpret_cast<float *>(&pDestination->_31), M.r[2], 128);
        vst1q_f32_ex(reinterpret_cast<float *>(&pDestination->_41), M.r[3], 128);
#else
        // One 4-float store per row, starting at the row's first element.
        vst1q_f32(reinterpret_cast<float *>(&pDestination->_11), M.r[0]);
        vst1q_f32(reinterpret_cast<float *>(&pDestination->_21), M.r[1]);
        vst1q_f32(reinterpret_cast<float *>(&pDestination->_31), M.r[2]);
        vst1q_f32(reinterpret_cast<float *>(&pDestination->_41), M.r[3]);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
        // One aligned 4-float store per row.
        _mm_store_ps(&pDestination->_11, M.r[0]);
        _mm_store_ps(&pDestination->_21, M.r[1]);
        _mm_store_ps(&pDestination->_31, M.r[2]);
        _mm_store_ps(&pDestination->_41, M.r[3]);
#endif
    }

// Scalar NaN / infinity classification helpers. They are defined only for the
// _XM_NO_INTRINSICS_ build and expand directly to isnan(x) / isinf(x).
#if defined(_XM_NO_INTRINSICS_)
#define XMISNAN(x)  isnan(x)
#define XMISINF(x)  isinf(x)
#endif

#if defined(_XM_SSE_INTRINSICS_)

// XM3UNPACK3INTO4(l1, l2, l3)
// Expands three packed vectors that together hold four 3-component values
//   l1 = (x1,y1,z1,x2)   l2 = (y2,z2,x3,y3)   l3 = (z3,x4,y4,z4)
// into one vector per value. The macro DECLARES V2, V3 and V4 in the scope
// where it is expanded:
//   V2 = (x2,y2,z2,z2)   V3 = (x3,y3,z3,z3)   V4 = (x4,y4,z4,0)
// l1 already carries x1,y1,z1 in its first three lanes, so no V1 is declared
// here. l2 and l3 are expanded more than once, so pass plain variables rather
// than expressions with side effects. The last statement has no trailing
// semicolon; the caller supplies it.
// The final line uses the l3 parameter itself (it previously named a variable
// `L3` from the calling scope, which ignored the argument actually passed).
#define XM3UNPACK3INTO4(l1, l2, l3) \
    XMVECTOR V3 = _mm_shuffle_ps(l2, l3, _MM_SHUFFLE(0, 0, 3, 2));\
    XMVECTOR V2 = _mm_shuffle_ps(l2, l1, _MM_SHUFFLE(3, 3, 1, 0));\
    V2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 0, 2));\
    XMVECTOR V4 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(l3), 32 / 8))

// XM3PACK4INTO3(v2x)
// Inverse packing step. Reads V1, V2, V3 and V4 from the expansion scope
// (V1 must be provided by the caller) and packs their x,y,z lanes back into
// three vectors, overwriting V1, V2 and V3 and assigning v2x:
//   V1  = (x1,y1,z1,x2)   v2x = (y2,z2,x3,y3)   V3 = (z3,x4,y4,z4)
// V2 is left holding an intermediate (x2,x2,z1,z1). The last statement has no
// trailing semicolon; the caller supplies it.
#define XM3PACK4INTO3(v2x) \
    v2x = _mm_shuffle_ps(V2, V3, _MM_SHUFFLE(1, 0, 2, 1));\
    V2 = _mm_shuffle_ps(V2, V1, _MM_SHUFFLE(2, 2, 0, 0));\
    V1 = _mm_shuffle_ps(V1, V2, _MM_SHUFFLE(0, 2, 1, 0));\
    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(0, 0, 2, 2));\
    V3 = _mm_shuffle_ps(V3, V4, _MM_SHUFFLE(2, 1, 2, 0))

#endif

/****************************************************************************
 *
 * General Vector
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Assignment operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------
 // Return a vector with all elements equaling zero
inline XMVECTOR XM_CALLCONV XMVectorZero() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#endif
}

//------------------------------------------------------------------------------
// Builds a vector from four floats; the result's lanes are (x, y, z, w) in order.
inline XMVECTOR XM_CALLCONV XMVectorSet
(
    float x,
    float y,
    float z,
    float w
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORF32 packed = { { { x, y, z, w } } };
    return packed.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Read each float's raw 32-bit pattern, then glue pairs into 64-bit halves
    // (first value in the low 32 bits, second in the high 32 bits).
    const uint64_t xBits = static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&x));
    const uint64_t yBits = static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&y));
    const uint64_t zBits = static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&z));
    const uint64_t wBits = static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&w));
    const float32x2_t lowHalf = vcreate_f32(xBits | (yBits << 32));
    const float32x2_t highHalf = vcreate_f32(zBits | (wBits << 32));
    return vcombine_f32(lowHalf, highHalf);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps takes its arguments from the highest lane down to the lowest.
    return _mm_set_ps(w, z, y, x);
#endif
}

//------------------------------------------------------------------------------
// Builds a vector whose lanes hold the raw 32-bit patterns (x, y, z, w) in order.
inline XMVECTOR XM_CALLCONV XMVectorSetInt
(
    uint32_t x,
    uint32_t y,
    uint32_t z,
    uint32_t w
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 packed = { { { x, y, z, w } } };
    return packed.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Glue pairs into 64-bit halves: first value low, second value high.
    const uint64_t lowBits = static_cast<uint64_t>(x) | (static_cast<uint64_t>(y) << 32);
    const uint64_t highBits = static_cast<uint64_t>(z) | (static_cast<uint64_t>(w) << 32);
    const uint32x4_t lanes = vcombine_u32(vcreate_u32(lowBits), vcreate_u32(highBits));
    return vreinterpretq_f32_u32(lanes);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_epi32 takes its arguments from the highest lane down to the lowest.
    const __m128i lanes = _mm_set_epi32(static_cast<int>(w), static_cast<int>(z), static_cast<int>(y), static_cast<int>(x));
    return _mm_castsi128_ps(lanes);
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with Value copied into all four lanes.
inline XMVECTOR XM_CALLCONV XMVectorReplicate(float Value) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORF32 splat = { { { Value, Value, Value, Value } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(Value);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_set_ps1(Value);
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with the float read from pValue[0] copied into all four lanes.
// pValue is dereferenced without a null check.

inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(const float* pValue) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float loaded = *pValue;
    const XMVECTORF32 splat = { { { loaded, loaded, loaded, loaded } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_f32(pValue);
#elif defined(_XM_AVX_INTRINSICS_)
    // AVX provides a dedicated load-and-broadcast instruction.
    return _mm_broadcast_ss(pValue);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_load_ps1(pValue);
#endif
}

//------------------------------------------------------------------------------
// Returns a vector whose four lanes all hold the raw 32-bit pattern Value.
inline XMVECTOR XM_CALLCONV XMVectorReplicateInt(uint32_t Value) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 splat = { { { Value, Value, Value, Value } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(Value));
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(Value)));
#endif
}

//------------------------------------------------------------------------------
// Returns a vector whose four lanes all hold the 32-bit pattern read from
// pValue[0]. pValue is dereferenced without a null check.

inline XMVECTOR XM_CALLCONV XMVectorReplicateIntPtr(const uint32_t* pValue) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const uint32_t loaded = *pValue;
    const XMVECTORU32 splat = { { { loaded, loaded, loaded, loaded } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vld1q_dup_u32(pValue));
#elif defined(_XM_SSE_INTRINSICS_)
    // The bits are loaded through a float pointer and broadcast unchanged.
    const float* const asFloat = reinterpret_cast<const float*>(pValue);
    return _mm_load_ps1(asFloat);
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with every bit set in all four lanes (an all-true mask).
inline XMVECTOR XM_CALLCONV XMVectorTrueInt() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 allBits = { { { 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU, 0xFFFFFFFFU } } };
    return allBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_s32(vdupq_n_s32(-1));
#elif defined(_XM_SSE_INTRINSICS_)
    // -1 as a 32-bit integer is all ones.
    return _mm_castsi128_ps(_mm_set1_epi32(-1));
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with every bit clear in all four lanes (an all-false mask).
inline XMVECTOR XM_CALLCONV XMVectorFalseInt() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // 0.0f has an all-zero bit pattern, so a float zero vector doubles as the mask.
    const XMVECTORF32 noBits = { { { 0.0f, 0.0f, 0.0f, 0.0f } } };
    return noBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(0));
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with V's x lane (lane 0) copied into all four lanes.
inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float lane = V.vector4_f32[0];
    const XMVECTORF32 splat = { { { lane, lane, lane, lane } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32(vget_low_f32(V), 0);
#elif defined(_XM_AVX2_INTRINSICS_) && defined(_XM_FAVOR_INTEL_)
    // Dedicated broadcast, used only when AVX2 and _XM_FAVOR_INTEL_ are both set.
    return _mm_broadcastss_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with V's y lane (lane 1) copied into all four lanes.
inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float lane = V.vector4_f32[1];
    const XMVECTORF32 splat = { { { lane, lane, lane, lane } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // y is element 1 of the low 64-bit half.
    return vdupq_lane_f32(vget_low_f32(V), 1);
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with V's z lane (lane 2) copied into all four lanes.
inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float lane = V.vector4_f32[2];
    const XMVECTORF32 splat = { { { lane, lane, lane, lane } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // z is element 0 of the high 64-bit half.
    return vdupq_lane_f32(vget_high_f32(V), 0);
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with V's w lane (lane 3) copied into all four lanes.
inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float lane = V.vector4_f32[3];
    const XMVECTORF32 splat = { { { lane, lane, lane, lane } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // w is element 1 of the high 64-bit half.
    return vdupq_lane_f32(vget_high_f32(V), 1);
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
#endif
}

//------------------------------------------------------------------------------
// Returns the vector (1.0f, 1.0f, 1.0f, 1.0f).
inline XMVECTOR XM_CALLCONV XMVectorSplatOne() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORF32 ones = { { { 1.0f, 1.0f, 1.0f, 1.0f } } };
    return ones.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(1.0f);
#elif defined(_XM_SSE_INTRINSICS_)
    // Uses the library's global constant instead of building the value.
    return g_XMOne;
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with positive infinity (bit pattern 0x7F800000) in every lane.
inline XMVECTOR XM_CALLCONV XMVectorSplatInfinity() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 infBits = { { { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 } } };
    return infBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(0x7F800000));
#elif defined(_XM_SSE_INTRINSICS_)
    // Uses the library's global constant instead of building the value.
    return g_XMInfinity;
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with a quiet NaN (bit pattern 0x7FC00000) in every lane.
inline XMVECTOR XM_CALLCONV XMVectorSplatQNaN() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 nanBits = { { { 0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000 } } };
    return nanBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(0x7FC00000));
#elif defined(_XM_SSE_INTRINSICS_)
    // Uses the library's global constant instead of building the value.
    return g_XMQNaN;
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with 1.192092896e-7f (bit pattern 0x34000000) in every lane.
inline XMVECTOR XM_CALLCONV XMVectorSplatEpsilon() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 epsBits = { { { 0x34000000, 0x34000000, 0x34000000, 0x34000000 } } };
    return epsBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(0x34000000));
#elif defined(_XM_SSE_INTRINSICS_)
    // Uses the library's global constant instead of building the value.
    return g_XMEpsilon;
#endif
}

//------------------------------------------------------------------------------
// Returns a vector with only the sign bit set (0x80000000, i.e. -0.0f) in every lane.
inline XMVECTOR XM_CALLCONV XMVectorSplatSignMask() noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORU32 signBits = { { { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U } } };
    return signBits.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000U));
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000)));
#endif
}

//------------------------------------------------------------------------------
// Returns lane i of V as a float; i must be in 0..3 (asserted). Slower than the
// fixed-lane getters and not recommended where performance matters.
inline float XM_CALLCONV XMVectorGetByIndex(FXMVECTOR V, size_t i) noexcept
{
    assert(i < 4);

#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[i];
#else
    // Copy the vector into the float-array view so it can be indexed at run time.
    XMVECTORF32 lanes;
    lanes.v = V;
    return lanes.f[i];
#endif
}

//------------------------------------------------------------------------------
// Returns the x component (lane 0) of V as a scalar float.
inline float XM_CALLCONV XMVectorGetX(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Lane 0 can be extracted directly, no shuffle needed.
    return _mm_cvtss_f32(V);
#endif
}

// Returns the y component (lane 1) of V as a scalar float.
inline float XM_CALLCONV XMVectorGetY(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move y into lane 0, then extract lane 0.
    const XMVECTOR splatY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    return _mm_cvtss_f32(splatY);
#endif
}

// Returns the z component (lane 2) of V as a scalar float.
inline float XM_CALLCONV XMVectorGetZ(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move z into lane 0, then extract lane 0.
    const XMVECTOR splatZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    return _mm_cvtss_f32(splatZ);
#endif
}

// Returns the w component (lane 3) of V as a scalar float.
inline float XM_CALLCONV XMVectorGetW(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move w into lane 0, then extract lane 0.
    const XMVECTOR splatW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    return _mm_cvtss_f32(splatW);
#endif
}

//------------------------------------------------------------------------------

// Writes lane i of V to *f. f must be non-null and i must be in 0..3 (both asserted).

inline void XM_CALLCONV XMVectorGetByIndexPtr(float* f, FXMVECTOR V, size_t i) noexcept
{
    assert(f != nullptr);
    assert(i < 4);

#if defined(_XM_NO_INTRINSICS_)
    *f = V.vector4_f32[i];
#else
    // Copy the vector into the float-array view so it can be indexed at run time.
    XMVECTORF32 lanes;
    lanes.v = V;
    *f = lanes.f[i];
#endif
}

//------------------------------------------------------------------------------

// Writes the x component (lane 0) of V to *x. x must be non-null (asserted).

inline void XM_CALLCONV XMVectorGetXPtr(float* x, FXMVECTOR V) noexcept
{
    assert(x != nullptr);
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(x, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_store_ss writes lane 0 only.
    _mm_store_ss(x, V);
#endif
}

// Writes the y component (lane 1) of V to *y. y must be non-null (asserted).

inline void XM_CALLCONV XMVectorGetYPtr(float* y, FXMVECTOR V) noexcept
{
    assert(y != nullptr);
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(y, V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // _mm_extract_ps yields the lane's raw bits as an int, so the destination
    // is written through an int pointer to keep the bit pattern unchanged.
    int* const rawBits = reinterpret_cast<int*>(y);
    *rawBits = _mm_extract_ps(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move y into lane 0, then store lane 0.
    const XMVECTOR splatY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    _mm_store_ss(y, splatY);
#endif
}

// Writes the z component (lane 2) of V to *z. z must be non-null (asserted).

inline void XM_CALLCONV XMVectorGetZPtr(float* z, FXMVECTOR V) noexcept
{
    assert(z != nullptr);
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(z, V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
    // _mm_extract_ps yields the lane's raw bits as an int, so the destination
    // is written through an int pointer to keep the bit pattern unchanged.
    int* const rawBits = reinterpret_cast<int*>(z);
    *rawBits = _mm_extract_ps(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move z into lane 0, then store lane 0.
    const XMVECTOR splatZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(z, splatZ);
#endif
}

// Writes the w component (lane 3) of V to *w. w must be non-null (asserted).

inline void XM_CALLCONV XMVectorGetWPtr(float* w, FXMVECTOR V) noexcept
{
    assert(w != nullptr);
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(w, V, 3);
#elif defined(_XM_SSE4_INTRINSICS_)
    // _mm_extract_ps yields the lane's raw bits as an int, so the destination
    // is written through an int pointer to keep the bit pattern unchanged.
    int* const rawBits = reinterpret_cast<int*>(w);
    *rawBits = _mm_extract_ps(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move w into lane 0, then store lane 0.
    const XMVECTOR splatW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    _mm_store_ss(w, splatW);
#endif
}

//------------------------------------------------------------------------------

// Returns the raw 32-bit pattern of lane i of V; i must be in 0..3 (asserted).
// Slower than the fixed-lane getters and not recommended where performance matters.
inline uint32_t XM_CALLCONV XMVectorGetIntByIndex(FXMVECTOR V, size_t i) noexcept
{
    assert(i < 4);

#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[i];
#else
    // Copy the vector into the uint32 array view so it can be indexed at run time.
    XMVECTORU32 lanes;
    lanes.v = V;
    return lanes.u[i];
#endif
}

//------------------------------------------------------------------------------

// Returns the raw 32-bit pattern of V's x component (lane 0).
inline uint32_t XM_CALLCONV XMVectorGetIntX(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Lane 0 can be moved straight to an integer register.
    const __m128i bits = _mm_castps_si128(V);
    return static_cast<uint32_t>(_mm_cvtsi128_si32(bits));
#endif
}

// Returns the raw 32-bit pattern of V's y component (lane 1).
inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // SSE4 can extract an arbitrary 32-bit lane directly.
    const __m128i bits = _mm_castps_si128(V);
    return static_cast<uint32_t>(_mm_extract_epi32(bits, 1));
#elif defined(_XM_SSE_INTRINSICS_)
    // Move y into lane 0, then read lane 0.
    const __m128i splatY = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(1, 1, 1, 1));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(splatY));
#endif
}

// Returns the raw 32-bit pattern of V's z component (lane 2).
inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 2);
#elif defined(_XM_SSE4_INTRINSICS_)
    // SSE4 can extract an arbitrary 32-bit lane directly.
    const __m128i bits = _mm_castps_si128(V);
    return static_cast<uint32_t>(_mm_extract_epi32(bits, 2));
#elif defined(_XM_SSE_INTRINSICS_)
    // Move z into lane 0, then read lane 0.
    const __m128i splatZ = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(2, 2, 2, 2));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(splatZ));
#endif
}

// Returns the raw 32-bit pattern of V's w component (lane 3).
inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(vreinterpretq_u32_f32(V), 3);
#elif defined(_XM_SSE4_INTRINSICS_)
    // SSE4 can extract an arbitrary 32-bit lane directly.
    const __m128i bits = _mm_castps_si128(V);
    return static_cast<uint32_t>(_mm_extract_epi32(bits, 3));
#elif defined(_XM_SSE_INTRINSICS_)
    // Move w into lane 0, then read lane 0.
    const __m128i splatW = _mm_shuffle_epi32(_mm_castps_si128(V), _MM_SHUFFLE(3, 3, 3, 3));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(splatW));
#endif
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit integer location in memory.

inline void XM_CALLCONV XMVectorGetIntByIndexPtr(uint32_t* x, FXMVECTOR V, size_t i) noexcept
{
    assert(x != nullptr);
    assert(i < 4);

#ifdef _XM_NO_INTRINSICS_
    // Scalar build: the lanes are a plain array.
    *x = V.vector4_u32[i];
#else
    // Copy the vector into the uint32 view so a runtime index can be used.
    XMVECTORU32 Lanes;
    Lanes.v = V;
    *x = Lanes.u[i];
#endif
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit integer location in memory.

inline void XM_CALLCONV XMVectorGetIntXPtr(uint32_t* x, FXMVECTOR V) noexcept
{
    assert(x != nullptr);
#ifdef _XM_NO_INTRINSICS_
    *x = V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Treat V's storage as uint32 lanes and store lane 0 to *x.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    vst1q_lane_u32(x, *pBits, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // A scalar store of the lowest element of V, written through a float view of *x.
    float* const pDest = reinterpret_cast<float*>(x);
    _mm_store_ss(pDest, V);
#endif
}

// Store the Y component into a 32 bit integer location in memory.

inline void XM_CALLCONV XMVectorGetIntYPtr(uint32_t* y, FXMVECTOR V) noexcept
{
    assert(y != nullptr);
#ifdef _XM_NO_INTRINSICS_
    *y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Treat V's storage as uint32 lanes and store lane 1 to *y.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    vst1q_lane_u32(y, *pBits, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Extract 32-bit lane 1 of the bit-cast vector directly.
    *y = static_cast<uint32_t>(_mm_extract_epi32(_mm_castps_si128(V), 1));
#elif defined(_XM_SSE_INTRINSICS_)
    // Replicate lane 1 into every lane, then store the lowest element.
    const XMVECTOR vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    float* const pDest = reinterpret_cast<float*>(y);
    _mm_store_ss(pDest, vSplat);
#endif
}

// Store the Z component into a 32 bit integer location in memory.

inline void XM_CALLCONV XMVectorGetIntZPtr(uint32_t* z, FXMVECTOR V) noexcept
{
    assert(z != nullptr);
#ifdef _XM_NO_INTRINSICS_
    *z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Treat V's storage as uint32 lanes and store lane 2 to *z.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    vst1q_lane_u32(z, *pBits, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Extract 32-bit lane 2 of the bit-cast vector directly.
    *z = static_cast<uint32_t>(_mm_extract_epi32(_mm_castps_si128(V), 2));
#elif defined(_XM_SSE_INTRINSICS_)
    // Replicate lane 2 into every lane, then store the lowest element.
    const XMVECTOR vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    float* const pDest = reinterpret_cast<float*>(z);
    _mm_store_ss(pDest, vSplat);
#endif
}

// Store the W component into a 32 bit integer location in memory.

inline void XM_CALLCONV XMVectorGetIntWPtr(uint32_t* w, FXMVECTOR V) noexcept
{
    assert(w != nullptr);
#ifdef _XM_NO_INTRINSICS_
    *w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Treat V's storage as uint32 lanes and store lane 3 to *w.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    vst1q_lane_u32(w, *pBits, 3);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Extract 32-bit lane 3 of the bit-cast vector directly.
    *w = static_cast<uint32_t>(_mm_extract_epi32(_mm_castps_si128(V), 3));
#elif defined(_XM_SSE_INTRINSICS_)
    // Replicate lane 3 into every lane, then store the lowest element.
    const XMVECTOR vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
    float* const pDest = reinterpret_cast<float*>(w);
    _mm_store_ss(pDest, vSplat);
#endif
}

//------------------------------------------------------------------------------

// Set a single indexed floating point component
inline XMVECTOR XM_CALLCONV XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) noexcept
{
    assert(i < 4);

    // Go through the float-array view so the lane can be chosen at run time.
    XMVECTORF32 Lanes;
    Lanes.v = V;
    Lanes.f[i] = f;
    return Lanes.v;
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a passed floating point value
inline XMVECTOR XM_CALLCONV XMVectorSetX(FXMVECTOR V, float x) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with x in lane 0 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { x, V.vector4_f32[1], V.vector4_f32[2], V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move the scalar into the low lane of V; the upper three lanes stay as they were.
    return _mm_move_ss(V, _mm_set_ss(x));
#endif
}

// Sets the Y component of a vector to a passed floating point value
inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with y in lane 1 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], y, V.vector4_f32[2], V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y, V, 1);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert lane 0 of the scalar vector into lane 1 of V (immediate 0x10).
    const XMVECTOR vScalar = _mm_set_ss(y);
    return _mm_insert_ps(V, vScalar, 0x10);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and y around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_set_ss(y));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}
// Sets the Z component of a vector to a passed floating point value
inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with z in lane 2 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], V.vector4_f32[1], z, V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(z, V, 2);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert lane 0 of the scalar vector into lane 2 of V (immediate 0x20).
    const XMVECTOR vScalar = _mm_set_ss(z);
    return _mm_insert_ps(V, vScalar, 0x20);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and z around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_set_ss(z));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 0, 1, 2));
#endif
}

// Sets the W component of a vector to a passed floating point value
inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with w in lane 3 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], V.vector4_f32[1], V.vector4_f32[2], w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(w, V, 3);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert lane 0 of the scalar vector into lane 3 of V (immediate 0x30).
    const XMVECTOR vScalar = _mm_set_ss(w);
    return _mm_insert_ps(V, vScalar, 0x30);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and w around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_set_ss(w));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(0, 2, 1, 3));
#endif
}

//------------------------------------------------------------------------------

// Sets a component of a vector to a floating point value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetByIndexPtr(FXMVECTOR V, const float* f, size_t i) noexcept
{
    assert(f != nullptr);
    assert(i < 4);

    // Go through the float-array view so the lane can be chosen at run time.
    XMVECTORF32 Lanes;
    Lanes.v = V;
    Lanes.f[i] = *f;
    return Lanes.v;
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a floating point value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetXPtr(FXMVECTOR V, const float* x) noexcept
{
    assert(x != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *x in lane 0 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { *x, V.vector4_f32[1], V.vector4_f32[2], V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(x, V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Load the scalar and move it into the low lane of V.
    return _mm_move_ss(V, _mm_load_ss(x));
#endif
}

// Sets the Y component of a vector to a floating point value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetYPtr(FXMVECTOR V, const float* y) noexcept
{
    assert(y != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *y in lane 1 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], *y, V.vector4_f32[2], V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(y, V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and y around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_load_ss(y));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}

// Sets the Z component of a vector to a floating point value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetZPtr(FXMVECTOR V, const float* z) noexcept
{
    assert(z != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *z in lane 2 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], V.vector4_f32[1], *z, V.vector4_f32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(z, V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and z around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_load_ss(z));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 0, 1, 2));
#endif
}

// Sets the W component of a vector to a floating point value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetWPtr(FXMVECTOR V, const float* w) noexcept
{
    assert(w != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *w in lane 3 and the other lanes copied from V.
    const XMVECTORF32 Result = { { { V.vector4_f32[0], V.vector4_f32[1], V.vector4_f32[2], *w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(w, V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and w around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_load_ss(w));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(0, 2, 1, 3));
#endif
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer passed by value
inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) noexcept
{
    assert(i < 4);

    // Go through the uint32-array view so the lane can be chosen at run time.
    XMVECTORU32 Lanes;
    Lanes.v = V;
    Lanes.u[i] = x;
    return Lanes;
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to an integer passed by value
inline XMVECTOR XM_CALLCONV XMVectorSetIntX(FXMVECTOR V, uint32_t x) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with x in lane 0 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { x, V.vector4_u32[1], V.vector4_u32[2], V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set the lane on the uint32 view of V, then view the result as floats again.
    const uint32x4_t vBits = vsetq_lane_u32(x, vreinterpretq_u32_f32(V), 0);
    return vreinterpretq_f32_u32(vBits);
#elif defined(_XM_SSE_INTRINSICS_)
    // Put the integer in the low lane of a vector, then move that lane into V.
    const __m128i vScalar = _mm_cvtsi32_si128(static_cast<int>(x));
    return _mm_move_ss(V, _mm_castsi128_ps(vScalar));
#endif
}

// Sets the Y component of a vector to an integer passed by value
inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with y in lane 1 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], y, V.vector4_u32[2], V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set the lane on the uint32 view of V, then view the result as floats again.
    const uint32x4_t vBits = vsetq_lane_u32(y, vreinterpretq_u32_f32(V), 1);
    return vreinterpretq_f32_u32(vBits);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert the integer straight into 32-bit lane 1.
    const __m128i vBits = _mm_insert_epi32(_mm_castps_si128(V), static_cast<int>(y), 1);
    return _mm_castsi128_ps(vBits);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and y around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    const __m128i vScalar = _mm_cvtsi32_si128(static_cast<int>(y));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_castsi128_ps(vScalar));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}

// Sets the Z component of a vector to an integer passed by value
inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with z in lane 2 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], V.vector4_u32[1], z, V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set the lane on the uint32 view of V, then view the result as floats again.
    const uint32x4_t vBits = vsetq_lane_u32(z, vreinterpretq_u32_f32(V), 2);
    return vreinterpretq_f32_u32(vBits);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert the integer straight into 32-bit lane 2.
    const __m128i vBits = _mm_insert_epi32(_mm_castps_si128(V), static_cast<int>(z), 2);
    return _mm_castsi128_ps(vBits);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and z around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
    const __m128i vScalar = _mm_cvtsi32_si128(static_cast<int>(z));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_castsi128_ps(vScalar));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 0, 1, 2));
#endif
}

// Sets the W component of a vector to an integer passed by value
inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w) noexcept
{
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with w in lane 3 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], V.vector4_u32[1], V.vector4_u32[2], w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set the lane on the uint32 view of V, then view the result as floats again.
    const uint32x4_t vBits = vsetq_lane_u32(w, vreinterpretq_u32_f32(V), 3);
    return vreinterpretq_f32_u32(vBits);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Insert the integer straight into 32-bit lane 3.
    const __m128i vBits = _mm_insert_epi32(_mm_castps_si128(V), static_cast<int>(w), 3);
    return _mm_castsi128_ps(vBits);
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and w around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
    const __m128i vScalar = _mm_cvtsi32_si128(static_cast<int>(w));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, _mm_castsi128_ps(vScalar));
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(0, 2, 1, 3));
#endif
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t* x, size_t i) noexcept
{
    assert(x != nullptr);
    assert(i < 4);

    // Go through the uint32-array view so the lane can be chosen at run time.
    XMVECTORU32 Lanes;
    Lanes.v = V;
    Lanes.u[i] = *x;
    return Lanes;
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to an integer value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t* x) noexcept
{
    assert(x != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *x in lane 0 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { *x, V.vector4_u32[1], V.vector4_u32[2], V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load *x into lane 0 of V's storage viewed as uint32 lanes.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    return vreinterpretq_f32_u32(vld1q_lane_u32(x, *pBits, 0));
#elif defined(_XM_SSE_INTRINSICS_)
    // Load the 32 bits through a float view and move them into the low lane of V.
    const float* const pSrc = reinterpret_cast<const float*>(x);
    return _mm_move_ss(V, _mm_load_ss(pSrc));
#endif
}

// Sets the Y component of a vector to an integer value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t* y) noexcept
{
    assert(y != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *y in lane 1 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], *y, V.vector4_u32[2], V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load *y into lane 1 of V's storage viewed as uint32 lanes.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    return vreinterpretq_f32_u32(vld1q_lane_u32(y, *pBits, 1));
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and y around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    const XMVECTOR vScalar = _mm_load_ss(reinterpret_cast<const float*>(y));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, vScalar);
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 2, 0, 1));
#endif
}

// Sets the Z component of a vector to an integer value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t* z) noexcept
{
    assert(z != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *z in lane 2 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], V.vector4_u32[1], *z, V.vector4_u32[3] } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load *z into lane 2 of V's storage viewed as uint32 lanes.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    return vreinterpretq_f32_u32(vld1q_lane_u32(z, *pBits, 2));
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and z around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 0, 1, 2));
    const XMVECTOR vScalar = _mm_load_ss(reinterpret_cast<const float*>(z));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, vScalar);
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(3, 0, 1, 2));
#endif
}

// Sets the W component of a vector to an integer value passed by pointer

inline XMVECTOR XM_CALLCONV XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t* w) noexcept
{
    assert(w != nullptr);
#ifdef _XM_NO_INTRINSICS_
    // Rebuild the vector with *w in lane 3 and the other lanes copied from V.
    const XMVECTORU32 Result = { { { V.vector4_u32[0], V.vector4_u32[1], V.vector4_u32[2], *w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load *w into lane 3 of V's storage viewed as uint32 lanes.
    const uint32x4_t* const pBits = reinterpret_cast<const uint32x4_t*>(&V);
    return vreinterpretq_f32_u32(vld1q_lane_u32(w, *pBits, 3));
#elif defined(_XM_SSE_INTRINSICS_)
    // Only the low lane can be replaced, so exchange x and w around the write.
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 2, 1, 3));
    const XMVECTOR vScalar = _mm_load_ss(reinterpret_cast<const float*>(w));
    const XMVECTOR vPatched = _mm_move_ss(vSwapped, vScalar);
    // The same shuffle undoes the exchange.
    return XM_PERMUTE_PS(vPatched, _MM_SHUFFLE(0, 2, 1, 3));
#endif
}

//------------------------------------------------------------------------------

// Rearranges the components of V: result lane k receives V's lane Ek.
// Each of E0..E3 must be in the range 0-3.
inline XMVECTOR XM_CALLCONV XMVectorSwizzle
(
    FXMVECTOR V,
    uint32_t E0,
    uint32_t E1,
    uint32_t E2,
    uint32_t E3
) noexcept
{
    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));

#ifdef _XM_NO_INTRINSICS_

    const XMVECTORF32 Swizzled = { { { V.vector4_f32[E0], V.vector4_f32[E1], V.vector4_f32[E2], V.vector4_f32[E3] } } };
    return Swizzled.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Each entry packs the four byte offsets of one 32-bit lane of V.
    static const uint32_t s_LaneBytes[4] =
    {
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
    };

    // 16-byte lookup table made of the two halves of V.
    uint8x8x2_t table;
    table.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
    table.val[1] = vreinterpret_u8_f32(vget_high_f32(V));

    // Byte-wise table lookups produce the low (x, y) and high (z, w) halves.
    const uint32x2_t idxLow = vcreate_u32(static_cast<uint64_t>(s_LaneBytes[E0]) | (static_cast<uint64_t>(s_LaneBytes[E1]) << 32));
    const uint8x8_t vLow = vtbl2_u8(table, vreinterpret_u8_u32(idxLow));

    const uint32x2_t idxHigh = vcreate_u32(static_cast<uint64_t>(s_LaneBytes[E2]) | (static_cast<uint64_t>(s_LaneBytes[E3]) << 32));
    const uint8x8_t vHigh = vtbl2_u8(table, vreinterpret_u8_u32(idxHigh));

    return vcombine_f32(vreinterpret_f32_u8(vLow), vreinterpret_f32_u8(vHigh));
#elif defined(_XM_AVX_INTRINSICS_)
    // Load the four lane indices as a control vector for a variable permute.
    unsigned int indices[4] = { E0, E1, E2, E3 };
    const __m128i vIndices = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&indices[0]));
    return _mm_permutevar_ps(V, vIndices);
#else
    // Generic path: copy the selected 32-bit lanes one at a time.
    auto pSrc = reinterpret_cast<const uint32_t*>(&V);
    const uint32_t picks[4] = { E0, E1, E2, E3 };

    XMVECTOR Swizzled;
    auto pDst = reinterpret_cast<uint32_t*>(&Swizzled);
    for (size_t lane = 0; lane < 4; ++lane)
    {
        pDst[lane] = pSrc[picks[lane]];
    }

    return Swizzled;
#endif
}

//------------------------------------------------------------------------------
// Builds a vector from any four components of V1 and V2. Each Permute index
// picks one source lane: 0-3 address V1's lanes and 4-7 address V2's lanes.
inline XMVECTOR XM_CALLCONV XMVectorPermute
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    uint32_t PermuteX,
    uint32_t PermuteY,
    uint32_t PermuteZ,
    uint32_t PermuteW
) noexcept
{
    assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);

#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Each entry packs the four byte offsets of one 32-bit lane within the
    // 32-byte table formed by V1 followed by V2.
    static const uint32_t s_LaneBytes[8] =
    {
        0x03020100, // XM_PERMUTE_0X
        0x07060504, // XM_PERMUTE_0Y
        0x0B0A0908, // XM_PERMUTE_0Z
        0x0F0E0D0C, // XM_PERMUTE_0W
        0x13121110, // XM_PERMUTE_1X
        0x17161514, // XM_PERMUTE_1Y
        0x1B1A1918, // XM_PERMUTE_1Z
        0x1F1E1D1C, // XM_PERMUTE_1W
    };

    uint8x8x4_t table;
    table.val[0] = vreinterpret_u8_f32(vget_low_f32(V1));
    table.val[1] = vreinterpret_u8_f32(vget_high_f32(V1));
    table.val[2] = vreinterpret_u8_f32(vget_low_f32(V2));
    table.val[3] = vreinterpret_u8_f32(vget_high_f32(V2));

    // Byte-wise table lookups produce the low (x, y) and high (z, w) halves.
    const uint32x2_t idxLow = vcreate_u32(static_cast<uint64_t>(s_LaneBytes[PermuteX]) | (static_cast<uint64_t>(s_LaneBytes[PermuteY]) << 32));
    const uint8x8_t vLow = vtbl4_u8(table, vreinterpret_u8_u32(idxLow));

    const uint32x2_t idxHigh = vcreate_u32(static_cast<uint64_t>(s_LaneBytes[PermuteZ]) | (static_cast<uint64_t>(s_LaneBytes[PermuteW]) << 32));
    const uint8x8_t vHigh = vtbl4_u8(table, vreinterpret_u8_u32(idxHigh));

    return vcombine_f32(vreinterpret_f32_u8(vLow), vreinterpret_f32_u8(vHigh));
#elif defined(_XM_AVX_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORU32 s_Three = { { { 3, 3, 3, 3 } } };

    XM_ALIGNED_DATA(16) unsigned int indices[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
    const __m128i vIndices = _mm_load_si128(reinterpret_cast<const __m128i*>(&indices[0]));

    // Lanes whose index is greater than 3 take their value from V2.
    const __m128 vFromV2 = _mm_castsi128_ps(_mm_cmpgt_epi32(vIndices, s_Three));
    // Keep the low two bits: the lane number inside whichever source is used.
    const __m128i vLanes = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vIndices), s_Three));

    // Permute both sources the same way, then blend per lane with the mask.
    const __m128 vPick1 = _mm_permutevar_ps(V1, vLanes);
    const __m128 vPick2 = _mm_permutevar_ps(V2, vLanes);

    return _mm_or_ps(_mm_andnot_ps(vFromV2, vPick1), _mm_and_ps(vFromV2, vPick2));
#else

    // Generic path: bit 2 of each index selects the source vector, bits 0-1
    // select the lane inside it.
    const uint32_t* sources[2];
    sources[0] = reinterpret_cast<const uint32_t*>(&V1);
    sources[1] = reinterpret_cast<const uint32_t*>(&V2);

    const uint32_t picks[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };

    XMVECTOR Permuted;
    auto pDst = reinterpret_cast<uint32_t*>(&Permuted);
    for (size_t lane = 0; lane < 4; ++lane)
    {
        const uint32_t pick = picks[lane];
        pDst[lane] = sources[pick >> 2][pick & 3];
    }

    return Permuted;
#endif
}

//------------------------------------------------------------------------------
// Define a control vector to be used in XMVectorSelect
// operations.  The four integers specified in XMVectorSelectControl
// serve as indices to select between components in two vectors.
// The first index controls selection for the first component of
// the vectors involved in a select operation, the second index
// controls selection for the second component etc.  A value of
// zero for an index causes the corresponding component from the first
// vector to be selected whereas a one causes the component from the
// second vector to be selected instead.

inline XMVECTOR XM_CALLCONV XMVectorSelectControl
(
    uint32_t VectorIndex0,
    uint32_t VectorIndex1,
    uint32_t VectorIndex2,
    uint32_t VectorIndex3
) noexcept
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Lane order is x=Index0, y=Index1, z=Index2, w=Index3.
    const __m128i vIndices = _mm_set_epi32(static_cast<int>(VectorIndex3), static_cast<int>(VectorIndex2), static_cast<int>(VectorIndex1), static_cast<int>(VectorIndex0));
    // Signed compare against zero: a lane holding 1 becomes 0xFFFFFFFF, a lane holding 0 stays 0.
    const __m128i vMask = _mm_cmpgt_epi32(vIndices, g_XMZero);
    return _mm_castsi128_ps(vMask);
#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Pack the indices pairwise into two 64-bit halves, then join them.
    const int32x2_t vLow = vcreate_s32(static_cast<uint64_t>(VectorIndex0) | (static_cast<uint64_t>(VectorIndex1) << 32));
    const int32x2_t vHigh = vcreate_s32(static_cast<uint64_t>(VectorIndex2) | (static_cast<uint64_t>(VectorIndex3) << 32));
    const int32x4_t vIndices = vcombine_s32(vLow, vHigh);
    // Signed compare against zero: a lane holding 1 becomes 0xFFFFFFFF, a lane holding 0 stays 0.
    return vreinterpretq_f32_u32(vcgtq_s32(vIndices, g_XMZero));
#else
    // Lookup from index (0 or 1) to the matching select mask.
    const uint32_t  Masks[] =
    {
        XM_SELECT_0,
        XM_SELECT_1
    };

    assert(VectorIndex0 < 2);
    assert(VectorIndex1 < 2);
    assert(VectorIndex2 < 2);
    assert(VectorIndex3 < 2);

    XMVECTOR Control;
    Control.vector4_u32[0] = Masks[VectorIndex0];
    Control.vector4_u32[1] = Masks[VectorIndex1];
    Control.vector4_u32[2] = Masks[VectorIndex2];
    Control.vector4_u32[3] = Masks[VectorIndex3];

    return Control;

#endif
}

//------------------------------------------------------------------------------

// Bitwise select: every result bit comes from V2 where the matching Control
// bit is set and from V1 where it is clear.
inline XMVECTOR XM_CALLCONV XMVectorSelect
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Control
) noexcept
{
#ifdef _XM_NO_INTRINSICS_

    XMVECTORU32 Blended;
    for (size_t lane = 0; lane < 4; ++lane)
    {
        const uint32_t mask = Control.vector4_u32[lane];
        Blended.u[lane] = (V1.vector4_u32[lane] & ~mask) | (V2.vector4_u32[lane] & mask);
    }
    return Blended.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t vMask = vreinterpretq_u32_f32(Control);
    return vbslq_f32(vMask, V2, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // (~Control & V1) | (V2 & Control)
    return _mm_or_ps(_mm_andnot_ps(Control, V1), _mm_and_ps(V2, Control));
#endif
}

//------------------------------------------------------------------------------

// Interleaves the x and y components of two vectors: (V1.x, V2.x, V1.y, V2.y).
inline XMVECTOR XM_CALLCONV XMVectorMergeXY
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#ifdef _XM_NO_INTRINSICS_

    const XMVECTORU32 Merged = { { { V1.vector4_u32[0], V2.vector4_u32[0], V1.vector4_u32[1], V2.vector4_u32[1] } } };
    return Merged.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The first half of the zip holds the interleaved low lanes.
    const float32x4x2_t vZipped = vzipq_f32(V1, V2);
    return vZipped.val[0];
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_unpacklo_ps(V1, V2);
#endif
}

//------------------------------------------------------------------------------

// Interleaves the z and w components of two vectors: (V1.z, V2.z, V1.w, V2.w).
inline XMVECTOR XM_CALLCONV XMVectorMergeZW
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#ifdef _XM_NO_INTRINSICS_

    const XMVECTORU32 Merged = { { { V1.vector4_u32[2], V2.vector4_u32[2], V1.vector4_u32[3], V2.vector4_u32[3] } } };
    return Merged.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The second half of the zip holds the interleaved high lanes.
    const float32x4x2_t vZipped = vzipq_f32(V1, V2);
    return vZipped.val[1];
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_unpackhi_ps(V1, V2);
#endif
}

//------------------------------------------------------------------------------

// Returns four consecutive components of the eight-component sequence
// V1.x..V1.w, V2.x..V2.w, starting at position Elements (0-3).
inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) noexcept
{
    assert(Elements < 4);

    const uint32_t first = Elements;
    const uint32_t second = Elements + 1;
    const uint32_t third = Elements + 2;
    const uint32_t fourth = Elements + 3;
    return XMVectorPermute(V1, V2, first, second, third, fourth);
}

//------------------------------------------------------------------------------

// Rotates the components of V left: result lane k is V's lane (Elements + k) mod 4.
inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) noexcept
{
    assert(Elements < 4);

    const uint32_t laneX = Elements & 3;
    const uint32_t laneY = (Elements + 1) & 3;
    const uint32_t laneZ = (Elements + 2) & 3;
    const uint32_t laneW = (Elements + 3) & 3;
    return XMVectorSwizzle(V, laneX, laneY, laneZ, laneW);
}

//------------------------------------------------------------------------------

// Rotates the components of V right: result lane k is V's lane (4 - Elements + k) mod 4.
inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) noexcept
{
    assert(Elements < 4);

    const uint32_t laneX = (4 - Elements) & 3;
    const uint32_t laneY = (5 - Elements) & 3;
    const uint32_t laneZ = (6 - Elements) & 3;
    const uint32_t laneW = (7 - Elements) & 3;
    return XMVectorSwizzle(V, laneX, laneY, laneZ, laneW);
}

//------------------------------------------------------------------------------

// Rotates VS left by VSLeftRotateElements, then builds the result lane by lane:
// a lane whose Select value has its low bit set takes the rotated VS component,
// every other lane keeps the VD component.
inline XMVECTOR XM_CALLCONV XMVectorInsert(
    FXMVECTOR VD, FXMVECTOR VS,
    uint32_t VSLeftRotateElements,
    uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3) noexcept
{
    // Only the low bit of each Select value is used.
    const XMVECTOR Mask = XMVectorSelectControl(Select0 & 1, Select1 & 1, Select2 & 1, Select3 & 1);
    const XMVECTOR Rotated = XMVectorRotateLeft(VS, VSLeftRotateElements);
    return XMVectorSelect(VD, Rotated, Mask);
}

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component floating-point equality test. Each result lane is 0xFFFFFFFF
// where the corresponding components compare equal and 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#ifdef _XM_NO_INTRINSICS_

    const uint32_t eqX = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    const uint32_t eqY = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    const uint32_t eqZ = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    const uint32_t eqW = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;

    const XMVECTORU32 Mask = { { { eqX, eqY, eqZ, eqW } } };
    return Mask.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t vMask = vceqq_f32(V1, V2);
    return vreinterpretq_f32_u32(vMask);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpeq_ps(V1, V2);
#endif
}

//------------------------------------------------------------------------------


// Per-component floating-point equality test that also reports a summary.
// Returns the same lane mask as a plain equality compare and writes to *pCR:
// XM_CRMASK_CR6TRUE when all four components are equal, XM_CRMASK_CR6FALSE
// when none are equal, and 0 for any mixed result.
inline XMVECTOR XM_CALLCONV XMVectorEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    assert(pCR != nullptr);
#ifdef _XM_NO_INTRINSICS_
    const uint32_t eqX = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    const uint32_t eqY = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    const uint32_t eqZ = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    const uint32_t eqW = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;

    uint32_t Summary = 0;
    if (eqX & eqY & eqZ & eqW)
    {
        // All elements are equal
        Summary = XM_CRMASK_CR6TRUE;
    }
    else if (!(eqX | eqY | eqZ | eqW))
    {
        // No elements are equal
        Summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = Summary;

    const XMVECTORU32 Mask = { { { eqX, eqY, eqZ, eqW } } };
    return Mask;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t vMask = vceqq_f32(V1, V2);
    // Two rounds of zipping gather one byte from each lane's mask into a
    // single 32-bit word.
    const uint8x8x2_t vBytes = vzip_u8(vreinterpret_u8_u32(vget_low_u32(vMask)), vreinterpret_u8_u32(vget_high_u32(vMask)));
    const uint16x4x2_t vWords = vzip_u16(vreinterpret_u16_u8(vBytes.val[0]), vreinterpret_u16_u8(vBytes.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(vWords.val[1]), 1);

    uint32_t Summary = 0;
    if (packed == 0xFFFFFFFFU)
    {
        // All elements are equal
        Summary = XM_CRMASK_CR6TRUE;
    }
    else if (!packed)
    {
        // No elements are equal
        Summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = Summary;
    return vreinterpretq_f32_u32(vMask);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vMask = _mm_cmpeq_ps(V1, V2);
    // One bit per lane, taken from the top bit of each lane of the mask.
    const int laneBits = _mm_movemask_ps(vMask);

    uint32_t Summary = 0;
    if (laneBits == 0xf)
    {
        // All elements are equal
        Summary = XM_CRMASK_CR6TRUE;
    }
    else if (!laneBits)
    {
        // No elements are equal
        Summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = Summary;
    return vMask;
#endif
}

//------------------------------------------------------------------------------
// Treat the components of the vectors as unsigned integers and
// compare individual bits between the two.  This is useful for
// comparing control vectors and result vectors returned from
// other comparison operations.

// Bitwise (integer) equality test: each component of the result is
// 0xFFFFFFFF where the 32-bit patterns of V1 and V2 match, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Evaluate each lane separately, then assemble the mask vector.
    const uint32_t maskX = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Reinterpret both operands as integers before comparing.
    int32x4_t lhsBits = vreinterpretq_s32_f32(V1);
    int32x4_t rhsBits = vreinterpretq_s32_f32(V2);
    return vreinterpretq_f32_u32(vceqq_s32(lhsBits, rhsBits));
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare on the raw bits, then hand the mask back as floats.
    return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2)));
#endif
}

//------------------------------------------------------------------------------


// Bitwise (integer) equality test that also writes a comparison summary to
// *pCR: XM_CRMASK_CR6TRUE when all components match, XM_CRMASK_CR6FALSE when
// none match, 0 for a mixed result. pCR must not be null (asserted).
// Returns the per-component mask (0xFFFFFFFF on match, 0 otherwise).
inline XMVECTOR XM_CALLCONV XMVectorEqualIntR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    assert(pCR != nullptr);
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR laneMask = XMVectorEqualInt(V1, V2);

    uint32_t summary = 0;
    if (XMVector4EqualInt(laneMask, XMVectorTrueInt()))
    {
        // Every lane matched
        summary |= XM_CRMASK_CR6TRUE;
    }
    else if (XMVector4EqualInt(laneMask, XMVectorFalseInt()))
    {
        // No lane matched
        summary |= XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return laneMask;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    // Zip bytes, then 16-bit pairs, so one 32-bit lane carries a byte from
    // each of the four comparison results.
    uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    uint16x4x2_t pairZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(pairZip.val[1]), 1);

    uint32_t summary = 0;
    if (packed == 0xFFFFFFFFU)
    {
        // Every lane matched
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (packed == 0)
    {
        // No lane matched
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return vreinterpretq_f32_u32(laneMask);
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Collect the top bit of each lane into the low four bits.
    const int signBits = _mm_movemask_ps(_mm_castsi128_ps(laneMask));

    uint32_t summary = 0;
    if (signBits == 0x0F)
    {
        // Every lane matched
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (signBits == 0)
    {
        // No lane matched
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return _mm_castsi128_ps(laneMask);
#endif
}

//------------------------------------------------------------------------------

// Tolerance-based equality: each component of the result is 0xFFFFFFFF where
// |V1 - V2| <= Epsilon for that component, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Magnitude of the per-component difference.
    const float distX = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
    const float distY = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
    const float distZ = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]);
    const float distW = fabsf(V1.vector4_f32[3] - V2.vector4_f32[3]);

    const uint32_t maskX = (distX <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (distY <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (distZ <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (distW <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t vDiff = vsubq_f32(V1, V2);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
    // Absolute-compare intrinsic, used only in this MSVC configuration.
    return vacleq_f32(vDiff, Epsilon);
#else
    float32x4_t vDist = vabsq_f32(vDiff);
    uint32x4_t vWithin = vcleq_f32(vDist, Epsilon);
    return vreinterpretq_f32_u32(vWithin);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vDiff = _mm_sub_ps(V1, V2);
    // |d| is formed as max(0 - d, d)
    const XMVECTOR vNegDiff = _mm_sub_ps(_mm_setzero_ps(), vDiff);
    const XMVECTOR vDist = _mm_max_ps(vNegDiff, vDiff);
    return _mm_cmple_ps(vDist, Epsilon);
#endif
}

//------------------------------------------------------------------------------

// Floating-point inequality test: each component of the result is 0xFFFFFFFF
// where V1 != V2, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Compare for equality, then invert every bit of the mask.
    uint32x4_t vSame = vceqq_f32(V1, V2);
    return vreinterpretq_f32_u32(vmvnq_u32(vSame));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vDiffers = _mm_cmpneq_ps(V1, V2);
    return vDiffers;
#endif
}

//------------------------------------------------------------------------------

// Bitwise (integer) inequality test: each component of the result is
// 0xFFFFFFFF where the 32-bit patterns of V1 and V2 differ, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorNotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Integer equality followed by a bitwise NOT.
    uint32x4_t vSame = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    return vreinterpretq_f32_u32(vmvnq_u32(vSame));
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer equality, then flip every bit by XOR-ing with g_XMNegOneMask.
    const __m128i vSame = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    return _mm_xor_ps(_mm_castsi128_ps(vSame), g_XMNegOneMask);
#endif
}

//------------------------------------------------------------------------------

// Per-component test for V1 > V2: each component of the result is 0xFFFFFFFF
// where the test holds, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorGreater
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vAbove = vcgtq_f32(V1, V2);
    return vreinterpretq_f32_u32(vAbove);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vAbove = _mm_cmpgt_ps(V1, V2);
    return vAbove;
#endif
}

//------------------------------------------------------------------------------


// Per-component test for V1 > V2 that also writes a comparison summary to
// *pCR: XM_CRMASK_CR6TRUE when the test holds for all components,
// XM_CRMASK_CR6FALSE when it holds for none, 0 for a mixed result.
// pCR must not be null (asserted).
// Returns the per-component mask (0xFFFFFFFF where V1 > V2, 0 otherwise).
inline XMVECTOR XM_CALLCONV XMVectorGreaterR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    assert(pCR != nullptr);
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    uint32_t summary = 0;
    if ((maskX & maskY & maskZ & maskW) != 0)
    {
        // Greater in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if ((maskX | maskY | maskZ | maskW) == 0)
    {
        // Greater in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t laneMask = vcgtq_f32(V1, V2);
    // Zip bytes, then 16-bit pairs, so one 32-bit lane carries a byte from
    // each of the four comparison results.
    uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    uint16x4x2_t pairZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(pairZip.val[1]), 1);

    uint32_t summary = 0;
    if (packed == 0xFFFFFFFFU)
    {
        // Greater in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (packed == 0)
    {
        // Greater in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return vreinterpretq_f32_u32(laneMask);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpgt_ps(V1, V2);
    // Collect the top bit of each lane into the low four bits.
    const int signBits = _mm_movemask_ps(laneMask);

    uint32_t summary = 0;
    if (signBits == 0xf)
    {
        // Greater in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (signBits == 0)
    {
        // Greater in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return laneMask;
#endif
}

//------------------------------------------------------------------------------

// Per-component test for V1 >= V2: each component of the result is
// 0xFFFFFFFF where the test holds, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vAtLeast = vcgeq_f32(V1, V2);
    return vreinterpretq_f32_u32(vAtLeast);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vAtLeast = _mm_cmpge_ps(V1, V2);
    return vAtLeast;
#endif
}

//------------------------------------------------------------------------------


// Per-component test for V1 >= V2 that also writes a comparison summary to
// *pCR: XM_CRMASK_CR6TRUE when the test holds for all components,
// XM_CRMASK_CR6FALSE when it holds for none, 0 for a mixed result.
// pCR must not be null (asserted).
// Returns the per-component mask (0xFFFFFFFF where V1 >= V2, 0 otherwise).
inline XMVECTOR XM_CALLCONV XMVectorGreaterOrEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    assert(pCR != nullptr);
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    uint32_t summary = 0;
    if ((maskX & maskY & maskZ & maskW) != 0)
    {
        // Greater or equal in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if ((maskX | maskY | maskZ | maskW) == 0)
    {
        // Greater or equal in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t laneMask = vcgeq_f32(V1, V2);
    // Zip bytes, then 16-bit pairs, so one 32-bit lane carries a byte from
    // each of the four comparison results.
    uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    uint16x4x2_t pairZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(pairZip.val[1]), 1);

    uint32_t summary = 0;
    if (packed == 0xFFFFFFFFU)
    {
        // Greater or equal in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (packed == 0)
    {
        // Greater or equal in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return vreinterpretq_f32_u32(laneMask);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpge_ps(V1, V2);
    // Collect the top bit of each lane into the low four bits.
    const int signBits = _mm_movemask_ps(laneMask);

    uint32_t summary = 0;
    if (signBits == 0xf)
    {
        // Greater or equal in every component
        summary = XM_CRMASK_CR6TRUE;
    }
    else if (signBits == 0)
    {
        // Greater or equal in no component
        summary = XM_CRMASK_CR6FALSE;
    }
    *pCR = summary;
    return laneMask;
#endif
}

//------------------------------------------------------------------------------

// Per-component test for V1 < V2: each component of the result is 0xFFFFFFFF
// where the test holds, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorLess
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vBelow = vcltq_f32(V1, V2);
    return vreinterpretq_f32_u32(vBelow);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vBelow = _mm_cmplt_ps(V1, V2);
    return vBelow;
#endif
}

//------------------------------------------------------------------------------

// Per-component test for V1 <= V2: each component of the result is
// 0xFFFFFFFF where the test holds, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorLessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t vAtMost = vcleq_f32(V1, V2);
    return vreinterpretq_f32_u32(vAtMost);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vAtMost = _mm_cmple_ps(V1, V2);
    return vAtMost;
#endif
}

//------------------------------------------------------------------------------

// Symmetric range test: each component of the result is 0xFFFFFFFF where
// -Bounds <= V <= Bounds for that component, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorInBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Upper side: V <= Bounds
    uint32x4_t vBelowUpper = vcleq_f32(V, Bounds);
    // Lower side: -Bounds <= V
    float32x4_t vLowerBound = vnegq_f32(Bounds);
    uint32x4_t vAboveLower = vcleq_f32(vLowerBound, V);
    // Both conditions must hold
    return vreinterpretq_f32_u32(vandq_u32(vBelowUpper, vAboveLower));
#elif defined(_XM_SSE_INTRINSICS_)
    // Upper side: V <= Bounds
    const XMVECTOR vBelowUpper = _mm_cmple_ps(V, Bounds);
    // Lower side: (Bounds * g_XMNegativeOne) <= V
    const XMVECTOR vLowerBound = _mm_mul_ps(Bounds, g_XMNegativeOne);
    const XMVECTOR vAboveLower = _mm_cmple_ps(vLowerBound, V);
    // Both conditions must hold
    return _mm_and_ps(vBelowUpper, vAboveLower);
#endif
}

//------------------------------------------------------------------------------


// Symmetric range test (-Bounds <= V <= Bounds per component) that also
// writes a summary to *pCR: XM_CRMASK_CR6BOUNDS when every component is in
// range, 0 otherwise. pCR must not be null (asserted).
// Returns the per-component mask (0xFFFFFFFF where in range, 0 otherwise).
inline XMVECTOR XM_CALLCONV XMVectorInBoundsR
(
    uint32_t* pCR,
    FXMVECTOR V,
    FXMVECTOR Bounds
) noexcept
{
    assert(pCR != nullptr);
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    uint32_t summary = 0;
    if ((maskX & maskY & maskZ & maskW) != 0)
    {
        // Every component lies inside the bounds
        summary = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = summary;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Upper side: V <= Bounds
    uint32x4_t vBelowUpper = vcleq_f32(V, Bounds);
    // Lower side: -Bounds <= V
    float32x4_t vLowerBound = vnegq_f32(Bounds);
    uint32x4_t vAboveLower = vcleq_f32(vLowerBound, V);
    // Both conditions must hold
    uint32x4_t laneMask = vandq_u32(vBelowUpper, vAboveLower);
    // Zip bytes, then 16-bit pairs, so one 32-bit lane carries a byte from
    // each of the four results.
    uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    uint16x4x2_t pairZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(pairZip.val[1]), 1);

    uint32_t summary = 0;
    if (packed == 0xFFFFFFFFU)
    {
        // Every component lies inside the bounds
        summary = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = summary;
    return vreinterpretq_f32_u32(laneMask);
#elif defined(_XM_SSE_INTRINSICS_)
    // Upper side: V <= Bounds
    const XMVECTOR vBelowUpper = _mm_cmple_ps(V, Bounds);
    // Lower side: (Bounds * g_XMNegativeOne) <= V
    const XMVECTOR vLowerBound = _mm_mul_ps(Bounds, g_XMNegativeOne);
    const XMVECTOR vAboveLower = _mm_cmple_ps(vLowerBound, V);
    // Both conditions must hold
    const XMVECTOR laneMask = _mm_and_ps(vBelowUpper, vAboveLower);

    uint32_t summary = 0;
    if (_mm_movemask_ps(laneMask) == 0xf)
    {
        // Every component lies inside the bounds
        summary = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = summary;
    return laneMask;
#endif
}

//------------------------------------------------------------------------------

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Per-component NaN test: each component of the result is 0xFFFFFFFF where
// the corresponding component of V is NaN, 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorIsNaN(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // A NaN never compares equal to itself, so the self-equality mask is
    // clear exactly in the NaN lanes; inverting it yields the answer.
    uint32x4_t vSelfEqual = vceqq_f32(V, V);
    return vreinterpretq_f32_u32(vmvnq_u32(vSelfEqual));
#elif defined(_XM_SSE_INTRINSICS_)
    // A NaN never compares equal to itself, so self-inequality flags it.
    const XMVECTOR vSelfDiffers = _mm_cmpneq_ps(V, V);
    return vSelfDiffers;
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Per-component infinity test: each component of the result is 0xFFFFFFFF
// where the corresponding component of V is +infinity or -infinity,
// 0 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorIsInfinite(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t maskX = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskY = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskZ = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0U;
    const uint32_t maskW = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0U;

    XMVECTORU32 Control = { { { maskX, maskY, maskZ, maskW } } };
    return Control.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Clear the sign so both infinities look alike
    uint32x4_t vMagnitude = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    // Lanes equal to the infinity constant become all ones
    uint32x4_t vIsInf = vceqq_f32(vreinterpretq_f32_u32(vMagnitude), g_XMInfinity);
    return vreinterpretq_f32_u32(vIsInf);
#elif defined(_XM_SSE_INTRINSICS_)
    // Clear the sign so both infinities look alike
    const __m128 vMagnitude = _mm_and_ps(V, g_XMAbsMask);
    // Lanes equal to the infinity constant become all ones
    return _mm_cmpeq_ps(vMagnitude, g_XMInfinity);
#endif
}

//------------------------------------------------------------------------------
// Rounding and clamping operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component minimum of V1 and V2.
// In the scalar path a component of V1 is selected only when it compares
// strictly less than V2's; otherwise V2's component is used.
inline XMVECTOR XM_CALLCONV XMVectorMin
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float lowX = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    const float lowY = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    const float lowZ = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    const float lowW = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];

    XMVECTORF32 Result = { { { lowX, lowY, lowZ, lowW } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t vLowest = vminq_f32(V1, V2);
    return vLowest;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vLowest = _mm_min_ps(V1, V2);
    return vLowest;
#endif
}

//------------------------------------------------------------------------------

// Per-component maximum of V1 and V2.
// In the scalar path a component of V1 is selected only when it compares
// strictly greater than V2's; otherwise V2's component is used.
inline XMVECTOR XM_CALLCONV XMVectorMax
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float highX = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    const float highY = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    const float highZ = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    const float highW = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];

    XMVECTORF32 Result = { { { highX, highY, highZ, highW } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t vHighest = vmaxq_f32(V1, V2);
    return vHighest;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vHighest = _mm_max_ps(V1, V2);
    return vHighest;
#endif
}

//------------------------------------------------------------------------------

namespace Internal
{
    // Rounds x to the nearest integral value; exact halfway cases go to the
    // even neighbour (banker's rounding).
    inline float round_to_nearest(float x) noexcept
    {
        const float whole = floorf(x);
        const float fraction = x - whole;

        if (fraction < 0.5f)
        {
            return whole;
        }
        if (fraction > 0.5f)
        {
            return whole + 1.f;
        }

        // Neither comparison held (an exact tie): keep `whole` when it is
        // even, i.e. when half of it has no fractional part.
        float halfWhole;
        (void)modff(whole / 2.f, &halfWhole);
        return ((2.f * halfWhole) == whole) ? whole : (whole + 1.f);
    }
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Rounds each component of V to an integral value using round-to-nearest.
// The scalar path delegates to Internal::round_to_nearest (ties go to the
// even neighbour); the SSE4 path requests _MM_FROUND_TO_NEAREST_INT.
// Returns the rounded vector.
inline XMVECTOR XM_CALLCONV XMVectorRound(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = { { {
            Internal::round_to_nearest(V.vector4_f32[0]),
            Internal::round_to_nearest(V.vector4_f32[1]),
            Internal::round_to_nearest(V.vector4_f32[2]),
            Internal::round_to_nearest(V.vector4_f32[3])
        } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit ARM has a dedicated rounding instruction.
    return vrndnq_f32(V);
#else
    // Isolate the sign bit of every component.
    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(V), g_XMNegativeZero);
    // Give g_XMNoFraction the same sign as V.
    // NOTE(review): g_XMNoFraction is presumably 8388608.0f (2^23), the
    // magnitude at which floats stop having fraction bits - confirm
    // against the constant's definition.
    float32x4_t sMagic = vreinterpretq_f32_u32(vorrq_u32(g_XMNoFraction, sign));
    // Add then subtract the magic value; the intermediate sum cannot hold
    // fraction bits, so the hardware rounding of the add does the work.
    float32x4_t R1 = vaddq_f32(V, sMagic);
    R1 = vsubq_f32(R1, sMagic);
    // Only use the rounded value where |V| <= g_XMNoFraction; larger
    // magnitudes are passed through unchanged.
    float32x4_t R2 = vabsq_f32(V);
    uint32x4_t mask = vcleq_f32(R2, g_XMNoFraction);
    return vbslq_f32(mask, R1, V);
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_round_ps(V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(_XM_SSE_INTRINSICS_)
    // Isolate the sign bit of every component.
    __m128 sign = _mm_and_ps(V, g_XMNegativeZero);
    // Give g_XMNoFraction the same sign as V.
    __m128 sMagic = _mm_or_ps(g_XMNoFraction, sign);
    // Add then subtract the magic value so the add's rounding removes the
    // fraction.
    __m128 R1 = _mm_add_ps(V, sMagic);
    R1 = _mm_sub_ps(R1, sMagic);
    // mask is set where |V| <= g_XMNoFraction.
    __m128 R2 = _mm_and_ps(V, g_XMAbsMask);
    __m128 mask = _mm_cmple_ps(R2, g_XMNoFraction);
    // Blend: original V where mask is clear, rounded value where it is set.
    R2 = _mm_andnot_ps(mask, V);
    R1 = _mm_and_ps(R1, mask);
    XMVECTOR vResult = _mm_xor_ps(R1, R2);
    return vResult;
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Truncates each component of V toward zero (drops the fractional part).
// Components whose magnitude is too large to carry a fraction are returned
// unchanged. Returns the truncated vector.
inline XMVECTOR XM_CALLCONV XMVectorTruncate(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    uint32_t     i;

    // Avoid C4701
    Result.vector4_f32[0] = 0.0f;

    for (i = 0; i < 4; i++)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            // NaN input: store the fixed bit pattern 0x7FC00000.
            Result.vector4_u32[i] = 0x7FC00000;
        }
        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
        {
            // Small enough to round-trip through a 32-bit integer; the
            // float -> int conversion discards the fraction.
            Result.vector4_f32[i] = static_cast<float>(static_cast<int32_t>(V.vector4_f32[i]));
        }
        else
        {
            // Magnitude of 8388608 or more (including infinities): pass
            // the value through untouched.
            Result.vector4_f32[i] = V.vector4_f32[i];
        }
    }
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit ARM has a dedicated rounding instruction.
    return vrndq_f32(V);
#else
    // vTest becomes a mask of the lanes where |V| < g_XMNoFraction.
    float32x4_t vTest = vabsq_f32(V);
    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));

    // Convert to integer and back to drop the fraction.
    int32x4_t vInt = vcvtq_s32_f32(V);
    float32x4_t vResult = vcvtq_f32_s32(vInt);

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_round_ps(V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    // (done as an integer compare on the raw bit patterns)
    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
    // Convert to int and back to float for rounding with truncation
    __m128i vInt = _mm_cvttps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Computes the floor of each component of V (largest integral value not
// greater than the component). Returns the resulting vector.
inline XMVECTOR XM_CALLCONV XMVectorFloor(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            floorf(V.vector4_f32[0]),
            floorf(V.vector4_f32[1]),
            floorf(V.vector4_f32[2]),
            floorf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit ARM has a dedicated rounding instruction.
    return vrndmq_f32(V);
#else
    // vTest becomes a mask of the lanes where |V| < g_XMNoFraction.
    float32x4_t vTest = vabsq_f32(V);
    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
    // Truncate
    int32x4_t vInt = vcvtq_s32_f32(V);
    float32x4_t vResult = vcvtq_f32_s32(vInt);
    // Truncation moves negative non-integers upward; flag those lanes.
    uint32x4_t vLargerMask = vcgtq_f32(vResult, V);
    // 0 -> 0, 0xffffffff -> -1.0f
    float32x4_t vLarger = vcvtq_f32_s32(vreinterpretq_s32_u32(vLargerMask));
    // Adding -1.0f in the flagged lanes steps down to the floor.
    vResult = vaddq_f32(vResult, vLarger);
    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_floor_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // (integer compare of the sign-cleared bit pattern against g_XMNoFraction)
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
    // Truncate
    __m128i vInt = _mm_cvttps_epi32(V);
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // Flag the lanes where truncation produced a value above V.
    __m128 vLarger = _mm_cmpgt_ps(vResult, V);
    // 0 -> 0, 0xffffffff -> -1.0f
    vLarger = _mm_cvtepi32_ps(_mm_castps_si128(vLarger));
    // Adding -1.0f in the flagged lanes steps down to the floor.
    vResult = _mm_add_ps(vResult, vLarger);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Computes the ceiling of each component of V (smallest integral value not
// less than the component). Returns the resulting vector.
inline XMVECTOR XM_CALLCONV XMVectorCeiling(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            ceilf(V.vector4_f32[0]),
            ceilf(V.vector4_f32[1]),
            ceilf(V.vector4_f32[2]),
            ceilf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit ARM has a dedicated rounding instruction.
    return vrndpq_f32(V);
#else
    // vTest becomes a mask of the lanes where |V| < g_XMNoFraction.
    float32x4_t vTest = vabsq_f32(V);
    vTest = vreinterpretq_f32_u32(vcltq_f32(vTest, g_XMNoFraction));
    // Truncate
    int32x4_t vInt = vcvtq_s32_f32(V);
    float32x4_t vResult = vcvtq_f32_s32(vInt);
    // Truncation moves positive non-integers downward; flag those lanes.
    uint32x4_t vSmallerMask = vcltq_f32(vResult, V);
    // 0 -> 0, 0xffffffff -> -1.0f
    float32x4_t vSmaller = vcvtq_f32_s32(vreinterpretq_s32_u32(vSmallerMask));
    // Subtracting -1.0f in the flagged lanes steps up to the ceiling.
    vResult = vsubq_f32(vResult, vSmaller);
    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32(vreinterpretq_u32_f32(vTest), vResult, V);
#endif
#elif defined(_XM_SSE4_INTRINSICS_)
    return _mm_ceil_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // (integer compare of the sign-cleared bit pattern against g_XMNoFraction)
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    vTest = _mm_cmplt_epi32(vTest, g_XMNoFraction);
    // Truncate
    __m128i vInt = _mm_cvttps_epi32(V);
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // Flag the lanes where truncation produced a value below V.
    __m128 vSmaller = _mm_cmplt_ps(vResult, V);
    // 0 -> 0, 0xffffffff -> -1.0f
    vSmaller = _mm_cvtepi32_ps(_mm_castps_si128(vSmaller));
    // Subtracting -1.0f in the flagged lanes steps up to the ceiling.
    vResult = _mm_sub_ps(vResult, vSmaller);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult, _mm_castsi128_ps(vTest));
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest, _mm_castps_si128(V));
    vResult = _mm_or_ps(vResult, _mm_castsi128_ps(vTest));
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Clamps each component of V to the range [Min, Max].
// Min must be less than or equal to Max in every component (asserted).
// Computed as min(Max, max(Min, V)).
inline XMVECTOR XM_CALLCONV XMVectorClamp
(
    FXMVECTOR V,
    FXMVECTOR Min,
    FXMVECTOR Max
) noexcept
{
    assert(XMVector4LessOrEqual(Min, Max));

#if defined(_XM_NO_INTRINSICS_)

    // Raise to the lower limit first, then cap at the upper limit.
    const XMVECTOR raised = XMVectorMax(Min, V);
    return XMVectorMin(Max, raised);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Raise to the lower limit first, then cap at the upper limit.
    float32x4_t vRaised = vmaxq_f32(Min, V);
    return vminq_f32(Max, vRaised);
#elif defined(_XM_SSE_INTRINSICS_)
    // Raise to the lower limit first, then cap at the upper limit.
    const XMVECTOR vRaised = _mm_max_ps(Min, V);
    return _mm_min_ps(Max, vRaised);
#endif
}

//------------------------------------------------------------------------------

// Clamps each component of V to the range [0, 1].
inline XMVECTOR XM_CALLCONV XMVectorSaturate(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Delegate to the general clamp with fixed limits of zero and one.
    return XMVectorClamp(V, XMVectorZero(), g_XMOne.v);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lift anything below zero up to zero
    float32x4_t vNonNegative = vmaxq_f32(V, vdupq_n_f32(0));
    // Cap anything above one at one
    return vminq_f32(vNonNegative, vdupq_n_f32(1.0f));
#elif defined(_XM_SSE_INTRINSICS_)
    // Lift anything below zero up to zero
    const XMVECTOR vNonNegative = _mm_max_ps(V, g_XMZero);
    // Cap anything above one at one
    return _mm_min_ps(vNonNegative, g_XMOne);
#endif
}

//------------------------------------------------------------------------------
// Bitwise logical operations
//------------------------------------------------------------------------------

// Bitwise AND of the 32-bit patterns of V1 and V2, component by component.
inline XMVECTOR XM_CALLCONV XMVectorAndInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t bitsX = V1.vector4_u32[0] & V2.vector4_u32[0];
    const uint32_t bitsY = V1.vector4_u32[1] & V2.vector4_u32[1];
    const uint32_t bitsZ = V1.vector4_u32[2] & V2.vector4_u32[2];
    const uint32_t bitsW = V1.vector4_u32[3] & V2.vector4_u32[3];

    XMVECTORU32 Result = { { { bitsX, bitsY, bitsZ, bitsW } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t lhsBits = vreinterpretq_u32_f32(V1);
    uint32x4_t rhsBits = vreinterpretq_u32_f32(V2);
    return vreinterpretq_f32_u32(vandq_u32(lhsBits, rhsBits));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vBoth = _mm_and_ps(V1, V2);
    return vBoth;
#endif
}

//------------------------------------------------------------------------------

// Bitwise AND of V1 with the complement of V2 (V1 & ~V2), component by
// component, on the raw 32-bit patterns.
inline XMVECTOR XM_CALLCONV XMVectorAndCInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const uint32_t bitsX = V1.vector4_u32[0] & ~V2.vector4_u32[0];
    const uint32_t bitsY = V1.vector4_u32[1] & ~V2.vector4_u32[1];
    const uint32_t bitsZ = V1.vector4_u32[2] & ~V2.vector4_u32[2];
    const uint32_t bitsW = V1.vector4_u32[3] & ~V2.vector4_u32[3];

    XMVECTORU32 Result = { { { bitsX, bitsY, bitsZ, bitsW } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t keepBits = vreinterpretq_u32_f32(V1);
    uint32x4_t clearBits = vreinterpretq_u32_f32(V2);
    return vreinterpretq_f32_u32(vbicq_u32(keepBits, clearBits));
#elif defined(_XM_SSE_INTRINSICS_)
    // and-not complements its FIRST operand, so V2 is passed first.
    return _mm_castsi128_ps(_mm_andnot_si128(_mm_castps_si128(V2), _mm_castps_si128(V1)));
#endif
}

//------------------------------------------------------------------------------

// Per-lane bitwise OR of V1 and V2 on the raw 32-bit patterns.
inline XMVECTOR XM_CALLCONV XMVectorOrInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const auto x = V1.vector4_u32[0] | V2.vector4_u32[0];
    const auto y = V1.vector4_u32[1] | V2.vector4_u32[1];
    const auto z = V1.vector4_u32[2] | V2.vector4_u32[2];
    const auto w = V1.vector4_u32[3] | V2.vector4_u32[3];

    XMVECTORU32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t lhsBits = vreinterpretq_u32_f32(V1);
    const uint32x4_t rhsBits = vreinterpretq_u32_f32(V2);
    return vreinterpretq_f32_u32(vorrq_u32(lhsBits, rhsBits));
#elif defined(_XM_SSE_INTRINSICS_)
    // Done in the integer domain, then cast back to a float vector.
    const __m128i lhsBits = _mm_castps_si128(V1);
    const __m128i rhsBits = _mm_castps_si128(V2);
    return _mm_castsi128_ps(_mm_or_si128(lhsBits, rhsBits));
#endif
}

//------------------------------------------------------------------------------

// Per-lane bitwise NOR: returns ~(V1 | V2) on the raw 32-bit patterns.
inline XMVECTOR XM_CALLCONV XMVectorNorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const auto x = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
    const auto y = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
    const auto z = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
    const auto w = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);

    XMVECTORU32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t eitherSet = vorrq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    // Invert by clearing the OR result out of g_XMNegOneMask (mask & ~eitherSet).
    const uint32x4_t inverted = vbicq_u32(g_XMNegOneMask, eitherSet);
    return vreinterpretq_f32_u32(inverted);
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i eitherSet = _mm_or_si128(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Invert via (~eitherSet) & g_XMNegOneMask.
    const __m128i inverted = _mm_andnot_si128(eitherSet, g_XMNegOneMask);
    return _mm_castsi128_ps(inverted);
#endif
}

//------------------------------------------------------------------------------

// Per-lane bitwise exclusive-OR of V1 and V2 on the raw 32-bit patterns.
inline XMVECTOR XM_CALLCONV XMVectorXorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const auto x = V1.vector4_u32[0] ^ V2.vector4_u32[0];
    const auto y = V1.vector4_u32[1] ^ V2.vector4_u32[1];
    const auto z = V1.vector4_u32[2] ^ V2.vector4_u32[2];
    const auto w = V1.vector4_u32[3] ^ V2.vector4_u32[3];

    XMVECTORU32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t lhsBits = vreinterpretq_u32_f32(V1);
    const uint32x4_t rhsBits = vreinterpretq_u32_f32(V2);
    return vreinterpretq_f32_u32(veorq_u32(lhsBits, rhsBits));
#elif defined(_XM_SSE_INTRINSICS_)
    // Done in the integer domain, then cast back to a float vector.
    const __m128i lhsBits = _mm_castps_si128(V1);
    const __m128i rhsBits = _mm_castps_si128(V2);
    return _mm_castsi128_ps(_mm_xor_si128(lhsBits, rhsBits));
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the component-wise negation of V.
inline XMVECTOR XM_CALLCONV XMVectorNegate(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = -V.vector4_f32[0];
    const float y = -V.vector4_f32[1];
    const float z = -V.vector4_f32[2];
    const float w = -V.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t negated = vnegq_f32(V);
    return negated;
#elif defined(_XM_SSE_INTRINSICS_)
    // Implemented as 0 - V.
    return _mm_sub_ps(_mm_setzero_ps(), V);
#endif
}

//------------------------------------------------------------------------------

// Component-wise sum V1 + V2.
inline XMVECTOR XM_CALLCONV XMVectorAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V1.vector4_f32[0] + V2.vector4_f32[0];
    const float y = V1.vector4_f32[1] + V2.vector4_f32[1];
    const float z = V1.vector4_f32[2] + V2.vector4_f32[2];
    const float w = V1.vector4_f32[3] + V2.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t total = vaddq_f32(V1, V2);
    return total;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR total = _mm_add_ps(V1, V2);
    return total;
#endif
}

//------------------------------------------------------------------------------

// Horizontal add: returns a vector whose four components all hold x + y + z + w of V.
inline XMVECTOR XM_CALLCONV XMVectorSum(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float total = V.vector4_f32[0] + V.vector4_f32[1] + V.vector4_f32[2] + V.vector4_f32[3];

    XMVECTORF32 Result = { { { total, total, total, total } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // Two rounds of pairwise addition leave the full sum in every lane.
    const float32x4_t pairSums = vpaddq_f32(V, V);
    return vpaddq_f32(pairSums, pairSums);
#else
    // Fold the high half onto the low half, then pairwise-add the remaining two lanes.
    const float32x2_t folded = vadd_f32(vget_low_f32(V), vget_high_f32(V));
    const float32x2_t total = vpadd_f32(folded, folded);
    return vcombine_f32(total, total);
#endif
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the full sum in every lane.
    const XMVECTOR pairSums = _mm_hadd_ps(V, V);
    return _mm_hadd_ps(pairSums, pairSums);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap neighbours within each pair and add: (x+y, x+y, z+w, z+w).
    const XMVECTOR swappedPairs = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 3, 0, 1));
    const XMVECTOR pairSums = _mm_add_ps(V, swappedPairs);
    // Swap the two halves and add again to complete the sum in all lanes.
    const XMVECTOR swappedHalves = XM_PERMUTE_PS(pairSums, _MM_SHUFFLE(1, 0, 3, 2));
    return _mm_add_ps(swappedHalves, pairSums);
#endif
}

//------------------------------------------------------------------------------

// Adds two vectors of angles (radians) and wraps each result by at most one turn.
// With -Pi <= V1 < Pi and -2Pi <= V2 <= 2Pi, every component of the result
// lies in -Pi <= Result < Pi.
inline XMVECTOR XM_CALLCONV XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTOR rawSum = XMVectorAdd(V1, V2);

    // Both range tests are made against the unwrapped sum.
    const XMVECTOR belowRange = XMVectorLess(rawSum, g_XMNegativePi.v);
    const XMVECTOR atOrAboveRange = XMVectorGreaterOrEqual(rawSum, g_XMPi.v);

    // Correction is +2Pi below -Pi, -2Pi at or above Pi, and 0 otherwise.
    XMVECTOR correction = XMVectorSelect(XMVectorZero(), g_XMTwoPi.v, belowRange);
    correction = XMVectorSelect(correction, g_XMNegativeTwoPi.v, atOrAboveRange);

    return XMVectorAdd(rawSum, correction);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t angle = vaddq_f32(V1, V2);

    // Lanes below -Pi: mask 2Pi in and add it.
    uint32x4_t wrapBits = vcltq_f32(angle, g_XMNegativePi);
    wrapBits = vandq_u32(wrapBits, g_XMTwoPi);
    angle = vaddq_f32(angle, vreinterpretq_f32_u32(wrapBits));

    // Lanes at or above Pi: mask 2Pi in and subtract it.
    wrapBits = vcgeq_f32(angle, g_XMPi);
    wrapBits = vandq_u32(wrapBits, g_XMTwoPi);
    return vsubq_f32(angle, vreinterpretq_f32_u32(wrapBits));
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR angle = _mm_add_ps(V1, V2);

    // Lanes below -Pi: mask 2Pi in and add it.
    XMVECTOR wrap = _mm_cmplt_ps(angle, g_XMNegativePi);
    wrap = _mm_and_ps(wrap, g_XMTwoPi);
    angle = _mm_add_ps(angle, wrap);

    // Lanes at or above Pi: mask 2Pi in and subtract it.
    wrap = _mm_cmpge_ps(angle, g_XMPi);
    wrap = _mm_and_ps(wrap, g_XMTwoPi);
    return _mm_sub_ps(angle, wrap);
#endif
}

//------------------------------------------------------------------------------

// Component-wise difference V1 - V2.
inline XMVECTOR XM_CALLCONV XMVectorSubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V1.vector4_f32[0] - V2.vector4_f32[0];
    const float y = V1.vector4_f32[1] - V2.vector4_f32[1];
    const float z = V1.vector4_f32[2] - V2.vector4_f32[2];
    const float w = V1.vector4_f32[3] - V2.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t difference = vsubq_f32(V1, V2);
    return difference;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR difference = _mm_sub_ps(V1, V2);
    return difference;
#endif
}

//------------------------------------------------------------------------------

// Subtracts two vectors of angles (radians) and wraps each result by at most one turn.
// With -Pi <= V1 < Pi and -2Pi <= V2 <= 2Pi, every component of the result
// lies in -Pi <= Result < Pi.
inline XMVECTOR XM_CALLCONV XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTOR rawDifference = XMVectorSubtract(V1, V2);

    // Both range tests are made against the unwrapped difference.
    const XMVECTOR belowRange = XMVectorLess(rawDifference, g_XMNegativePi.v);
    const XMVECTOR atOrAboveRange = XMVectorGreaterOrEqual(rawDifference, g_XMPi.v);

    // Correction is +2Pi below -Pi, -2Pi at or above Pi, and 0 otherwise.
    XMVECTOR correction = XMVectorSelect(XMVectorZero(), g_XMTwoPi.v, belowRange);
    correction = XMVectorSelect(correction, g_XMNegativeTwoPi.v, atOrAboveRange);

    return XMVectorAdd(rawDifference, correction);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR angle = vsubq_f32(V1, V2);

    // Lanes below -Pi: mask 2Pi in and add it.
    uint32x4_t wrapBits = vcltq_f32(angle, g_XMNegativePi);
    wrapBits = vandq_u32(wrapBits, g_XMTwoPi);
    angle = vaddq_f32(angle, vreinterpretq_f32_u32(wrapBits));

    // Lanes at or above Pi: mask 2Pi in and subtract it.
    wrapBits = vcgeq_f32(angle, g_XMPi);
    wrapBits = vandq_u32(wrapBits, g_XMTwoPi);
    return vsubq_f32(angle, vreinterpretq_f32_u32(wrapBits));
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR angle = _mm_sub_ps(V1, V2);

    // Lanes below -Pi: mask 2Pi in and add it.
    XMVECTOR wrap = _mm_cmplt_ps(angle, g_XMNegativePi);
    wrap = _mm_and_ps(wrap, g_XMTwoPi);
    angle = _mm_add_ps(angle, wrap);

    // Lanes at or above Pi: mask 2Pi in and subtract it.
    wrap = _mm_cmpge_ps(angle, g_XMPi);
    wrap = _mm_and_ps(wrap, g_XMTwoPi);
    return _mm_sub_ps(angle, wrap);
#endif
}

//------------------------------------------------------------------------------

// Component-wise product V1 * V2.
inline XMVECTOR XM_CALLCONV XMVectorMultiply
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V1.vector4_f32[0] * V2.vector4_f32[0];
    const float y = V1.vector4_f32[1] * V2.vector4_f32[1];
    const float z = V1.vector4_f32[2] * V2.vector4_f32[2];
    const float w = V1.vector4_f32[3] * V2.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t product = vmulq_f32(V1, V2);
    return product;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR product = _mm_mul_ps(V1, V2);
    return product;
#endif
}

//------------------------------------------------------------------------------

// Component-wise multiply-accumulate: returns V1 * V2 + V3.
inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V1.vector4_f32[0] * V2.vector4_f32[0] + V3.vector4_f32[0];
    const float y = V1.vector4_f32[1] * V2.vector4_f32[1] + V3.vector4_f32[1];
    const float z = V1.vector4_f32[2] * V2.vector4_f32[2] + V3.vector4_f32[2];
    const float w = V1.vector4_f32[3] * V2.vector4_f32[3] + V3.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Both NEON forms take the addend first: (V3, V1, V2) -> V3 + V1 * V2.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    const float32x4_t accumulated = vfmaq_f32(V3, V1, V2);
#else
    const float32x4_t accumulated = vmlaq_f32(V3, V1, V2);
#endif
    return accumulated;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR accumulated = XM_FMADD_PS(V1, V2, V3);
    return accumulated;
#endif
}

//------------------------------------------------------------------------------

// Component-wise quotient V1 / V2.
inline XMVECTOR XM_CALLCONV XMVectorDivide
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V1.vector4_f32[0] / V2.vector4_f32[0];
    const float y = V1.vector4_f32[1] / V2.vector4_f32[1];
    const float z = V1.vector4_f32[2] / V2.vector4_f32[2];
    const float w = V1.vector4_f32[3] / V2.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit targets use the vector divide intrinsic directly.
    return vdivq_f32(V1, V2);
#else
    // Otherwise: take the reciprocal estimate of V2, sharpen it with two
    // Newton-Raphson steps, and multiply by the numerator.
    float32x4_t inverse = vrecpeq_f32(V2);
    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t correction = vrecpsq_f32(inverse, V2);
        inverse = vmulq_f32(correction, inverse);
    }
    return vmulq_f32(V1, inverse);
#endif
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR quotient = _mm_div_ps(V1, V2);
    return quotient;
#endif
}

//------------------------------------------------------------------------------

// Component-wise V3 - (V1 * V2).
inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]);
    const float y = V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]);
    const float z = V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]);
    const float w = V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Both NEON forms take the minuend first: (V3, V1, V2) -> V3 - V1 * V2.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    const float32x4_t remainder = vfmsq_f32(V3, V1, V2);
#else
    const float32x4_t remainder = vmlsq_f32(V3, V1, V2);
#endif
    return remainder;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR remainder = XM_FNMADD_PS(V1, V2, V3);
    return remainder;
#endif
}

//------------------------------------------------------------------------------

// Multiplies every component of V by the scalar ScaleFactor.
inline XMVECTOR XM_CALLCONV XMVectorScale
(
    FXMVECTOR V,
    float    ScaleFactor
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = V.vector4_f32[0] * ScaleFactor;
    const float y = V.vector4_f32[1] * ScaleFactor;
    const float z = V.vector4_f32[2] * ScaleFactor;
    const float w = V.vector4_f32[3] * ScaleFactor;

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t scaled = vmulq_n_f32(V, ScaleFactor);
    return scaled;
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast the scalar to all four lanes, then multiply.
    const XMVECTOR factor = _mm_set_ps1(ScaleFactor);
    return _mm_mul_ps(factor, V);
#endif
}

//------------------------------------------------------------------------------

// Component-wise reciprocal estimate of V. The intrinsic paths use the hardware
// estimate instructions; the scalar path performs a full division.
inline XMVECTOR XM_CALLCONV XMVectorReciprocalEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = 1.f / V.vector4_f32[0];
    const float y = 1.f / V.vector4_f32[1];
    const float z = 1.f / V.vector4_f32[2];
    const float w = 1.f / V.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t estimate = vrecpeq_f32(V);
    return estimate;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR estimate = _mm_rcp_ps(V);
    return estimate;
#endif
}

//------------------------------------------------------------------------------

// Component-wise reciprocal 1 / V.
inline XMVECTOR XM_CALLCONV XMVectorReciprocal(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = 1.f / V.vector4_f32[0];
    const float y = 1.f / V.vector4_f32[1];
    const float z = 1.f / V.vector4_f32[2];
    const float w = 1.f / V.vector4_f32[3];

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // 64-bit targets divide a splatted 1.0 by V.
    return vdivq_f32(vdupq_n_f32(1.0f), V);
#else
    // Otherwise: hardware estimate sharpened by two Newton-Raphson steps.
    float32x4_t inverse = vrecpeq_f32(V);
    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t correction = vrecpsq_f32(inverse, V);
        inverse = vmulq_f32(correction, inverse);
    }
    return inverse;
#endif
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR inverse = _mm_div_ps(g_XMOne, V);
    return inverse;
#endif
}

//------------------------------------------------------------------------------
// Return an estimated square root
// Component-wise square-root estimate of V. Only the NEON path is an actual
// approximation; the scalar and SSE paths compute a regular square root.
inline XMVECTOR XM_CALLCONV XMVectorSqrtEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = sqrtf(V.vector4_f32[0]);
    const float y = sqrtf(V.vector4_f32[1]);
    const float z = sqrtf(V.vector4_f32[2]);
    const float w = sqrtf(V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Reciprocal-square-root estimate plus one Newton-Raphson refinement step.
    const float32x4_t roughRsqrt = vrsqrteq_f32(V);
    const float32x4_t scaled = vmulq_f32(V, roughRsqrt);
    const float32x4_t correction = vrsqrtsq_f32(scaled, roughRsqrt);
    const float32x4_t refinedRsqrt = vmulq_f32(roughRsqrt, correction);

    // sqrt(V) = V * rsqrt(V); lanes where V is +infinity or zero pass V through instead.
    const XMVECTOR isInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
    const XMVECTOR isZero = XMVectorEqual(V, vdupq_n_f32(0));
    const XMVECTOR root = vmulq_f32(V, refinedRsqrt);
    // The two masks agree only where both are false, i.e. on ordinary lanes.
    const XMVECTOR useRoot = XMVectorEqualInt(isInfinity, isZero);
    return XMVectorSelect(V, root, useRoot);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR root = _mm_sqrt_ps(V);
    return root;
#endif
}

//------------------------------------------------------------------------------

// Component-wise square root of V.
inline XMVECTOR XM_CALLCONV XMVectorSqrt(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = sqrtf(V.vector4_f32[0]);
    const float y = sqrtf(V.vector4_f32[1]);
    const float z = sqrtf(V.vector4_f32[2]);
    const float w = sqrtf(V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Reciprocal-square-root estimate sharpened by three Newton-Raphson steps.
    float32x4_t rsqrt = vrsqrteq_f32(V);
    for (int pass = 0; pass < 3; ++pass)
    {
        const float32x4_t scaled = vmulq_f32(V, rsqrt);
        const float32x4_t correction = vrsqrtsq_f32(scaled, rsqrt);
        rsqrt = vmulq_f32(rsqrt, correction);
    }

    // sqrt(V) = V * rsqrt(V); lanes where V is +infinity or zero pass V through instead.
    const XMVECTOR isInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
    const XMVECTOR isZero = XMVectorEqual(V, vdupq_n_f32(0));
    const XMVECTOR root = vmulq_f32(V, rsqrt);
    // The two masks agree only where both are false, i.e. on ordinary lanes.
    const XMVECTOR useRoot = XMVectorEqualInt(isInfinity, isZero);
    return XMVectorSelect(V, root, useRoot);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR root = _mm_sqrt_ps(V);
    return root;
#endif
}

//------------------------------------------------------------------------------

// Component-wise estimate of 1 / sqrt(V). The intrinsic paths use the hardware
// estimate instructions; the scalar path computes the exact expression.
inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrtEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = 1.f / sqrtf(V.vector4_f32[0]);
    const float y = 1.f / sqrtf(V.vector4_f32[1]);
    const float z = 1.f / sqrtf(V.vector4_f32[2]);
    const float w = 1.f / sqrtf(V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t estimate = vrsqrteq_f32(V);
    return estimate;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR estimate = _mm_rsqrt_ps(V);
    return estimate;
#endif
}

//------------------------------------------------------------------------------

// Component-wise 1 / sqrt(V).
inline XMVECTOR XM_CALLCONV XMVectorReciprocalSqrt(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = 1.f / sqrtf(V.vector4_f32[0]);
    const float y = 1.f / sqrtf(V.vector4_f32[1]);
    const float z = 1.f / sqrtf(V.vector4_f32[2]);
    const float w = 1.f / sqrtf(V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Hardware reciprocal-square-root estimate sharpened by two Newton-Raphson steps.
    float32x4_t rsqrt = vrsqrteq_f32(V);
    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t scaled = vmulq_f32(V, rsqrt);
        const float32x4_t correction = vrsqrtsq_f32(scaled, rsqrt);
        rsqrt = vmulq_f32(rsqrt, correction);
    }
    return rsqrt;
#elif defined(_XM_SSE_INTRINSICS_)
    // Full-precision square root followed by a full-precision divide.
    const XMVECTOR root = _mm_sqrt_ps(V);
    return _mm_div_ps(g_XMOne, root);
#endif
}

//------------------------------------------------------------------------------

// Computes 2 raised to the power of each component of V.
//
// The scalar path calls exp2f per component and the SVML path calls _mm_exp2_ps.
// The NEON and SSE paths split V into a truncated integer part and a fractional
// remainder, evaluate a polynomial in the remainder, build a power of two directly
// in the float exponent field, and then pick among several candidate results with
// bit masks so that NaN, overflow, underflow and subnormal outputs are handled.
inline XMVECTOR XM_CALLCONV XMVectorExp2(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            exp2f(V.vector4_f32[0]),
            exp2f(V.vector4_f32[1]),
            exp2f(V.vector4_f32[2]),
            exp2f(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Split V into an integer part (conversion to int32, then back to float) and the remainder y.
    int32x4_t itrunc = vcvtq_s32_f32(V);
    float32x4_t ftrunc = vcvtq_f32_s32(itrunc);
    float32x4_t y = vsubq_f32(V, ftrunc);

    // Horner evaluation of a 7th-degree polynomial in y with constant term 1.
    // NOTE(review): the result is later used as a divisor, so the coefficients
    // presumably approximate 2^(-y) - confirm against the g_XMExpEst* definitions.
    float32x4_t poly = vmlaq_f32(g_XMExpEst6, g_XMExpEst7, y);
    poly = vmlaq_f32(g_XMExpEst5, poly, y);
    poly = vmlaq_f32(g_XMExpEst4, poly, y);
    poly = vmlaq_f32(g_XMExpEst3, poly, y);
    poly = vmlaq_f32(g_XMExpEst2, poly, y);
    poly = vmlaq_f32(g_XMExpEst1, poly, y);
    poly = vmlaq_f32(g_XMOne, poly, y);

    // result0: add the exponent bias to the integer part and shift it into the
    // float exponent field (bit 23 upwards) to form a power of two, then divide by poly.
    int32x4_t biased = vaddq_s32(itrunc, g_XMExponentBias);
    biased = vshlq_n_s32(biased, 23);
    float32x4_t result0 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);

    // result1: same construction with a different bias (g_XM253), rescaled by
    // g_XMMinNormal afterwards; selected below for the subnormal-result case.
    biased = vaddq_s32(itrunc, g_XM253);
    biased = vshlq_n_s32(biased, 23);
    float32x4_t result1 = XMVectorDivide(vreinterpretq_f32_s32(biased), poly);
    result1 = vmulq_f32(g_XMMinNormal.v, result1);

    // Use selection to handle the cases
    //  if (V is NaN) -> QNaN;
    //  else if (V sign bit set)
    //      if (V > -150)
    //         if (V.exponent < -126) -> result1
    //         else -> result0
    //      else -> +0
    //  else
    //      if (V < 128) -> result0
    //      else -> +inf

    // Non-negative inputs: result0 while the bit pattern of V is below g_XMBin128, else +inf.
    uint32x4_t comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBin128);
    float32x4_t result2 = vbslq_f32(comp, result0, g_XMInfinity);

    // Integer part below g_XMSubnormalExponent: use the rescaled result1.
    comp = vcltq_s32(itrunc, g_XMSubnormalExponent);
    float32x4_t result3 = vbslq_f32(comp, result1, result0);

    // Negative inputs: the signed-integer compare of the bit patterns decides
    // between result3 and +0 (see the "V > -150" case above).
    comp = vcltq_s32(vreinterpretq_s32_f32(V), g_XMBinNeg150);
    float32x4_t result4 = vbslq_f32(comp, result3, g_XMZero);

    // Pick the negative-input or non-negative-input candidate from the sign bit of V.
    int32x4_t sign = vandq_s32(vreinterpretq_s32_f32(V), g_XMNegativeZero);
    comp = vceqq_s32(sign, g_XMNegativeZero);
    float32x4_t result5 = vbslq_f32(comp, result4, result2);

    // NaN test: exponent bits all set (t1) and the g_XMQNaNTest bits not all zero (not t0).
    int32x4_t t0 = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
    int32x4_t t1 = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    t0 = vreinterpretq_s32_u32(vceqq_s32(t0, g_XMZero));
    t1 = vreinterpretq_s32_u32(vceqq_s32(t1, g_XMInfinity));
    int32x4_t isNaN = vbicq_s32(t1, t0);

    // NaN lanes return g_XMQNaN; all other lanes return the selected result.
    float32x4_t vResult = vbslq_f32(vreinterpretq_u32_s32(isNaN), g_XMQNaN, result5);
    return vResult;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_exp2_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Split V into a truncated integer part and the remainder y.
    __m128i itrunc = _mm_cvttps_epi32(V);
    __m128 ftrunc = _mm_cvtepi32_ps(itrunc);
    __m128 y = _mm_sub_ps(V, ftrunc);

    // Horner evaluation of a 7th-degree polynomial in y with constant term 1.
    // NOTE(review): the result is later used as a divisor, so the coefficients
    // presumably approximate 2^(-y) - confirm against the g_XMExpEst* definitions.
    __m128 poly = XM_FMADD_PS(g_XMExpEst7, y, g_XMExpEst6);
    poly = XM_FMADD_PS(poly, y, g_XMExpEst5);
    poly = XM_FMADD_PS(poly, y, g_XMExpEst4);
    poly = XM_FMADD_PS(poly, y, g_XMExpEst3);
    poly = XM_FMADD_PS(poly, y, g_XMExpEst2);
    poly = XM_FMADD_PS(poly, y, g_XMExpEst1);
    poly = XM_FMADD_PS(poly, y, g_XMOne);

    // result0: add the exponent bias to the integer part and shift it into the
    // float exponent field (bit 23 upwards) to form a power of two, then divide by poly.
    __m128i biased = _mm_add_epi32(itrunc, g_XMExponentBias);
    biased = _mm_slli_epi32(biased, 23);
    __m128 result0 = _mm_div_ps(_mm_castsi128_ps(biased), poly);

    // result1: same construction with a different bias (g_XM253), rescaled by
    // g_XMMinNormal afterwards; selected below for the subnormal-result case.
    biased = _mm_add_epi32(itrunc, g_XM253);
    biased = _mm_slli_epi32(biased, 23);
    __m128 result1 = _mm_div_ps(_mm_castsi128_ps(biased), poly);
    result1 = _mm_mul_ps(g_XMMinNormal.v, result1);

    // Use selection to handle the cases
    //  if (V is NaN) -> QNaN;
    //  else if (V sign bit set)
    //      if (V > -150)
    //         if (V.exponent < -126) -> result1
    //         else -> result0
    //      else -> +0
    //  else
    //      if (V < 128) -> result0
    //      else -> +inf

    // Every selection below is the mask idiom (mask & a) | (~mask & b).

    // Non-negative inputs: result0 while the bit pattern of V is below g_XMBin128, else +inf.
    __m128i comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBin128);
    __m128i select0 = _mm_and_si128(comp, _mm_castps_si128(result0));
    __m128i select1 = _mm_andnot_si128(comp, g_XMInfinity);
    __m128i result2 = _mm_or_si128(select0, select1);

    // Integer part below g_XMSubnormalExponent: use the rescaled result1.
    comp = _mm_cmplt_epi32(itrunc, g_XMSubnormalExponent);
    select1 = _mm_and_si128(comp, _mm_castps_si128(result1));
    select0 = _mm_andnot_si128(comp, _mm_castps_si128(result0));
    __m128i result3 = _mm_or_si128(select0, select1);

    // Negative inputs: the signed-integer compare of the bit patterns decides
    // between result3 and +0 (see the "V > -150" case above).
    comp = _mm_cmplt_epi32(_mm_castps_si128(V), g_XMBinNeg150);
    select0 = _mm_and_si128(comp, result3);
    select1 = _mm_andnot_si128(comp, g_XMZero);
    __m128i result4 = _mm_or_si128(select0, select1);

    // Pick the negative-input or non-negative-input candidate from the sign bit of V.
    __m128i sign = _mm_and_si128(_mm_castps_si128(V), g_XMNegativeZero);
    comp = _mm_cmpeq_epi32(sign, g_XMNegativeZero);
    select0 = _mm_and_si128(comp, result4);
    select1 = _mm_andnot_si128(comp, result2);
    __m128i result5 = _mm_or_si128(select0, select1);

    // NaN test: exponent bits all set (t1) and the g_XMQNaNTest bits not all zero (not t0).
    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
    __m128i isNaN = _mm_andnot_si128(t0, t1);

    // NaN lanes return g_XMQNaN; all other lanes return the selected result.
    select0 = _mm_and_si128(isNaN, g_XMQNaN);
    select1 = _mm_andnot_si128(isNaN, result5);
    __m128i vResult = _mm_or_si128(select0, select1);

    return _mm_castsi128_ps(vResult);
#endif
}

//------------------------------------------------------------------------------

// Computes 10 raised to the power of each component of V.
inline XMVECTOR XM_CALLCONV XMVectorExp10(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = powf(10.0f, V.vector4_f32[0]);
    const float y = powf(10.0f, V.vector4_f32[1]);
    const float z = powf(10.0f, V.vector4_f32[2]);
    const float w = powf(10.0f, V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_exp10_ps(V);
#else
    // 10^V == 2^(V * log2(10)): rescale the exponent and reuse the base-2 routine.
    const XMVECTOR scaledExponent = XMVectorMultiply(g_XMLg10, V);
    return XMVectorExp2(scaledExponent);
#endif
}

//------------------------------------------------------------------------------

// Computes e raised to the power of each component of V.
inline XMVECTOR XM_CALLCONV XMVectorExpE(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float x = expf(V.vector4_f32[0]);
    const float y = expf(V.vector4_f32[1]);
    const float z = expf(V.vector4_f32[2]);
    const float w = expf(V.vector4_f32[3]);

    XMVECTORF32 Result = { { { x, y, z, w } } };
    return Result.v;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_exp_ps(V);
#else
    // e^V == 2^(V * log2(e)): rescale the exponent and reuse the base-2 routine.
    const XMVECTOR scaledExponent = XMVectorMultiply(g_XMLgE, V);
    return XMVectorExp2(scaledExponent);
#endif
}

//------------------------------------------------------------------------------

// Forwards to XMVectorExp2: this function computes the base-2 exponential, not e^V.
inline XMVECTOR XM_CALLCONV XMVectorExp(FXMVECTOR V) noexcept
{
    const XMVECTOR powerOfTwo = XMVectorExp2(V);
    return powerOfTwo;
}

//------------------------------------------------------------------------------

#if defined(_XM_SSE_INTRINSICS_)

namespace Internal
{
    // Shifts each 32-bit lane of 'value' left by the count held in the matching
    // lane of 'count'. _mm_sll_epi32 applies one shared count to all lanes, so
    // every lane is broadcast and shifted separately, and the four results are
    // then reassembled into a single vector.
    inline __m128i multi_sll_epi32(__m128i value, __m128i count) noexcept
    {
        // Lane 0: broadcast value and count, mask the count with g_XMMaskX, shift.
        __m128i lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0));
        __m128i amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted0 = _mm_sll_epi32(lane, amount);

        // Lane 1
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted1 = _mm_sll_epi32(lane, amount);

        // Lane 2
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted2 = _mm_sll_epi32(lane, amount);

        // Lane 3
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted3 = _mm_sll_epi32(lane, amount);

        // Reassemble: (s0,s0,s1,s1) and (s2,s2,s3,s3), then pick even lanes -> (s0,s1,s2,s3).
        const __m128 lowPair = _mm_shuffle_ps(_mm_castsi128_ps(shifted0), _mm_castsi128_ps(shifted1), _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 highPair = _mm_shuffle_ps(_mm_castsi128_ps(shifted2), _mm_castsi128_ps(shifted3), _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 combined = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(2, 0, 2, 0));
        return _mm_castps_si128(combined);
    }

    // Shifts each 32-bit lane of 'value' right (logical shift) by the count held
    // in the matching lane of 'count'. Same broadcast / shift / reassemble scheme
    // as multi_sll_epi32, using _mm_srl_epi32.
    inline __m128i multi_srl_epi32(__m128i value, __m128i count) noexcept
    {
        // Lane 0: broadcast value and count, mask the count with g_XMMaskX, shift.
        __m128i lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 0, 0, 0));
        __m128i amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(0, 0, 0, 0));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted0 = _mm_srl_epi32(lane, amount);

        // Lane 1
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(1, 1, 1, 1));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted1 = _mm_srl_epi32(lane, amount);

        // Lane 2
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 2, 2, 2));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(2, 2, 2, 2));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted2 = _mm_srl_epi32(lane, amount);

        // Lane 3
        lane = _mm_shuffle_epi32(value, _MM_SHUFFLE(3, 3, 3, 3));
        amount = _mm_shuffle_epi32(count, _MM_SHUFFLE(3, 3, 3, 3));
        amount = _mm_and_si128(amount, g_XMMaskX);
        const __m128i shifted3 = _mm_srl_epi32(lane, amount);

        // Reassemble: (s0,s0,s1,s1) and (s2,s2,s3,s3), then pick even lanes -> (s0,s1,s2,s3).
        const __m128 lowPair = _mm_shuffle_ps(_mm_castsi128_ps(shifted0), _mm_castsi128_ps(shifted1), _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 highPair = _mm_shuffle_ps(_mm_castsi128_ps(shifted2), _mm_castsi128_ps(shifted3), _MM_SHUFFLE(0, 0, 0, 0));
        const __m128 combined = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(2, 0, 2, 0));
        return _mm_castps_si128(combined);
    }

    // For each 32-bit lane, returns the index of the highest set bit, found by a
    // binary search over bit ranges (16, 8, 4, 2, then 1). The comparisons are
    // signed, so a lane with its top bit set fails every "greater than" test.
    inline __m128i GetLeadingBit(const __m128i value) noexcept
    {
        static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } };
        static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
        static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } };
        static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } };

        // Each stage: compare against a threshold, turn the all-ones mask into a
        // 0/1 flag, scale the flag to the stage's shift amount, shift the working
        // value down by it, and fold the amount into the running bit index.

        // Stage 16: anything above 0xFFFF?
        __m128i exceeds = _mm_cmpgt_epi32(value, g_XM0000FFFF);
        __m128i flag = _mm_srli_epi32(exceeds, 31);
        __m128i bitIndex = _mm_slli_epi32(flag, 4);
        __m128i remaining = multi_srl_epi32(value, bitIndex);

        // Stage 8: anything above 0xFF?
        exceeds = _mm_cmpgt_epi32(remaining, g_XM000000FF);
        flag = _mm_srli_epi32(exceeds, 31);
        __m128i step = _mm_slli_epi32(flag, 3);
        remaining = multi_srl_epi32(remaining, step);
        bitIndex = _mm_or_si128(bitIndex, step);

        // Stage 4: anything above 0xF?
        exceeds = _mm_cmpgt_epi32(remaining, g_XM0000000F);
        flag = _mm_srli_epi32(exceeds, 31);
        step = _mm_slli_epi32(flag, 2);
        remaining = multi_srl_epi32(remaining, step);
        bitIndex = _mm_or_si128(bitIndex, step);

        // Stage 2: anything above 0x3?
        exceeds = _mm_cmpgt_epi32(remaining, g_XM00000003);
        flag = _mm_srli_epi32(exceeds, 31);
        step = _mm_slli_epi32(flag, 1);
        remaining = multi_srl_epi32(remaining, step);
        bitIndex = _mm_or_si128(bitIndex, step);

        // Stage 1: what is left is at most 3, so its bit 1 is the final index bit.
        step = _mm_srli_epi32(remaining, 1);
        return _mm_or_si128(bitIndex, step);
    }
} // namespace Internal

#endif // _XM_SSE_INTRINSICS_

#if defined(_XM_ARM_NEON_INTRINSICS_)

namespace Internal
{
    // Per-lane binary search for the position of the most significant set bit,
    // mirroring the SSE implementation of the same name.
    //
    // Each stage tests the remaining value against a 16-, 8-, 4- or 2-bit mask
    // (signed compare). When it is larger, the value is shifted right by that
    // width and the width is OR-ed into the running index r.
    //
    // Implementation notes:
    //  * The compare mask is turned into 0/1 with a logical (unsigned) shift;
    //    an arithmetic shift of an all-ones mask would give -1 instead of 1.
    //  * vshlq_s32 shifts left for positive counts and right for negative
    //    counts, so only the count handed to it is negated; r and s themselves
    //    stay positive so they can be accumulated with OR.
    //  * The variable right shift is arithmetic; the callers visible in this
    //    file pass a masked significand, which is presumably non-negative.
    inline int32x4_t GetLeadingBit(const int32x4_t value) noexcept
    {
        static const XMVECTORI32 g_XM0000FFFF = { { { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF } } };
        static const XMVECTORI32 g_XM000000FF = { { { 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF } } };
        static const XMVECTORI32 g_XM0000000F = { { { 0x0000000F, 0x0000000F, 0x0000000F, 0x0000000F } } };
        static const XMVECTORI32 g_XM00000003 = { { { 0x00000003, 0x00000003, 0x00000003, 0x00000003 } } };

        uint32x4_t c = vcgtq_s32(value, g_XM0000FFFF);              // c = (v > 0xFFFF)
        int32x4_t b = vreinterpretq_s32_u32(vshrq_n_u32(c, 31));    // b = (c ? 1 : 0)
        int32x4_t r = vshlq_n_s32(b, 4);                            // r = (b << 4)
        int32x4_t v = vshlq_s32(value, vnegq_s32(r));               // v = (v >> r)

        c = vcgtq_s32(v, g_XM000000FF);                             // c = (v > 0xFF)
        b = vreinterpretq_s32_u32(vshrq_n_u32(c, 31));              // b = (c ? 1 : 0)
        int32x4_t s = vshlq_n_s32(b, 3);                            // s = (b << 3)
        v = vshlq_s32(v, vnegq_s32(s));                             // v = (v >> s)
        r = vorrq_s32(r, s);                                        // r = (r | s)

        c = vcgtq_s32(v, g_XM0000000F);                             // c = (v > 0xF)
        b = vreinterpretq_s32_u32(vshrq_n_u32(c, 31));              // b = (c ? 1 : 0)
        s = vshlq_n_s32(b, 2);                                      // s = (b << 2)
        v = vshlq_s32(v, vnegq_s32(s));                             // v = (v >> s)
        r = vorrq_s32(r, s);                                        // r = (r | s)

        c = vcgtq_s32(v, g_XM00000003);                             // c = (v > 0x3)
        b = vreinterpretq_s32_u32(vshrq_n_u32(c, 31));              // b = (c ? 1 : 0)
        s = vshlq_n_s32(b, 1);                                      // s = (b << 1)
        v = vshlq_s32(v, vnegq_s32(s));                             // v = (v >> s)
        r = vorrq_s32(r, s);                                        // r = (r | s)

        s = vshrq_n_s32(v, 1);                                      // remaining value supplies bit 0
        r = vorrq_s32(r, s);
        return r;
    }

} // namespace Internal

#endif

//------------------------------------------------------------------------------

// Computes the base-2 logarithm of each component of V.
//
// Paths:
//  * _XM_NO_INTRINSICS_   : calls log2f per component.
//  * _XM_SVML_INTRINSICS_ : forwards to _mm_log2_ps.
//  * NEON / SSE           : splits each lane into an integer exponent e and a
//    significand field t (renormalized via Internal::GetLeadingBit when the
//    exponent field is zero), forms y = (1 | t) - 1, and evaluates
//        e + y * (Est0 + Est1*y + ... + Est7*y^7)
//    by Horner's scheme. Special inputs are then patched in with bit selects,
//    following the table in the comments below.
//
// The NEON path classifies "positive" with signed integer compares on the raw
// bits, the same way the SSE path and XMVectorLog10 / XMVectorLogE do, so the
// classification does not depend on how the hardware's floating-point compare
// treats subnormal inputs.
inline XMVECTOR XM_CALLCONV XMVectorLog2(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            log2f(V.vector4_f32[0]),
            log2f(V.vector4_f32[1]),
            log2f(V.vector4_f32[2]),
            log2f(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
    uint32x4_t isExponentZero = vceqq_s32(vreinterpretq_s32_f32(g_XMZero), rawBiased);

    // Compute exponent and significand for normals.
    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
    int32x4_t trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    int32x4_t leading = Internal::GetLeadingBit(trailing);
    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
    int32x4_t trailingSub = vshlq_s32(trailing, shift);
    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);

    // Compute the approximation.
    int32x4_t tmp = vorrq_s32(vreinterpretq_s32_f32(g_XMOne), t);
    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);

    // Horner evaluation; vmlaq_f32(a, b, c) computes a + b * c.
    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> log2(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);

    // Signed integer compares on the raw bits: "greater than zero" and
    // "greater than the +infinity pattern" (i.e. a positive NaN).
    uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero);
    uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);

    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isZero = vceqq_u32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
    t0 = vceqq_u32(t0, g_XMZero);
    t1 = vceqq_u32(t1, g_XMInfinity);
    uint32x4_t isNaN = vbicq_u32(t1, t0);

    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
    result = vbslq_f32(isPositive, result, tmp2);
    result = vbslq_f32(isNaN, g_XMQNaN, result);
    return result;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_log2_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);

    // Compute exponent and significand for normals.
    __m128i biased = _mm_srli_epi32(rawBiased, 23);
    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
    __m128i trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    __m128i leading = Internal::GetLeadingBit(trailing);
    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
    __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);

    // Bitwise select: subnormal values where the exponent field is zero.
    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
    __m128i e = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isExponentZero, trailingSub);
    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
    __m128i t = _mm_or_si128(select0, select1);

    // Compute the approximation.
    __m128i tmp = _mm_or_si128(g_XMOne, t);
    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);

    // Horner evaluation, finishing with the integer exponent as constant term.
    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> log2(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);

    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);

    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
    __m128i isNaN = _mm_andnot_si128(t0, t1);

    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
    __m128i result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
    tmp = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isPositive, result);
    select1 = _mm_andnot_si128(isPositive, tmp);
    result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isNaN, g_XMQNaN);
    select1 = _mm_andnot_si128(isNaN, result);
    result = _mm_or_si128(select0, select1);

    return _mm_castsi128_ps(result);
#endif
}

//------------------------------------------------------------------------------

// Computes the base-10 logarithm of each component of V.
//
// Paths:
//  * _XM_NO_INTRINSICS_   : calls log10f per component.
//  * _XM_SVML_INTRINSICS_ : forwards to _mm_log10_ps.
//  * NEON / SSE           : computes the same exponent + polynomial estimate
//    as the base-2 routine (e + y * (Est0 + ... + Est7*y^7), Horner form),
//    multiplies it by g_XMInvLg10 to change base, then patches in the special
//    cases listed in the comment table below with bit selects.
inline XMVECTOR XM_CALLCONV XMVectorLog10(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = { { {
            log10f(V.vector4_f32[0]),
            log10f(V.vector4_f32[1]),
            log10f(V.vector4_f32[2]),
            log10f(V.vector4_f32[3])
        } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
    uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);

    // Compute exponent and significand for normals.
    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
    int32x4_t trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    int32x4_t leading = Internal::GetLeadingBit(trailing);
    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
    int32x4_t trailingSub = vshlq_s32(trailing, shift);
    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);

    // Compute the approximation.
    int32x4_t tmp = vorrq_s32(g_XMOne, t);
    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);

    // Horner evaluation; vmlaq_f32(a, b, c) computes a + b * c.
    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);

    // Change of base: scale the base-2 estimate (the variable keeps its name).
    log2 = vmulq_f32(g_XMInvLg10, log2);

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> log10(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);

    // Signed integer compares on the raw bits.
    uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero);
    uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);

    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isZero = vceqq_u32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
    t0 = vceqq_u32(t0, g_XMZero);
    t1 = vceqq_u32(t1, g_XMInfinity);
    uint32x4_t isNaN = vbicq_u32(t1, t0);

    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
    result = vbslq_f32(isPositive, result, tmp2);
    result = vbslq_f32(isNaN, g_XMQNaN, result);
    return result;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_log10_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);

    // Compute exponent and significand for normals.
    __m128i biased = _mm_srli_epi32(rawBiased, 23);
    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
    __m128i trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    __m128i leading = Internal::GetLeadingBit(trailing);
    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
    __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);

    // Bitwise select: subnormal values where the exponent field is zero.
    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
    __m128i e = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isExponentZero, trailingSub);
    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
    __m128i t = _mm_or_si128(select0, select1);

    // Compute the approximation.
    __m128i tmp = _mm_or_si128(g_XMOne, t);
    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);

    // Horner evaluation, finishing with the integer exponent as constant term.
    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));

    // Change of base: scale the base-2 estimate (the variable keeps its name).
    log2 = _mm_mul_ps(g_XMInvLg10, log2);

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> log10(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);

    // Signed integer compares on the raw bits.
    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);

    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
    __m128i isNaN = _mm_andnot_si128(t0, t1);

    // Apply the special cases in increasing priority; NaN wins last.
    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
    __m128i result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
    tmp = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isPositive, result);
    select1 = _mm_andnot_si128(isPositive, tmp);
    result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isNaN, g_XMQNaN);
    select1 = _mm_andnot_si128(isNaN, result);
    result = _mm_or_si128(select0, select1);

    return _mm_castsi128_ps(result);
#endif
}

//------------------------------------------------------------------------------

// Computes the natural logarithm of each component of V.
//
// Paths:
//  * _XM_NO_INTRINSICS_   : calls logf per component.
//  * _XM_SVML_INTRINSICS_ : forwards to _mm_log_ps.
//  * NEON / SSE           : computes the same exponent + polynomial estimate
//    as the base-2 routine (e + y * (Est0 + ... + Est7*y^7), Horner form),
//    multiplies it by g_XMInvLgE to change base, then patches in the special
//    cases listed in the comment table below with bit selects.
inline XMVECTOR XM_CALLCONV XMVectorLogE(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 Result = { { {
            logf(V.vector4_f32[0]),
            logf(V.vector4_f32[1]),
            logf(V.vector4_f32[2]),
            logf(V.vector4_f32[3])
        } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    int32x4_t rawBiased = vandq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    int32x4_t trailing = vandq_s32(vreinterpretq_s32_f32(V), g_XMQNaNTest);
    uint32x4_t isExponentZero = vceqq_s32(g_XMZero, rawBiased);

    // Compute exponent and significand for normals.
    int32x4_t biased = vshrq_n_s32(rawBiased, 23);
    int32x4_t exponentNor = vsubq_s32(biased, g_XMExponentBias);
    int32x4_t trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    int32x4_t leading = Internal::GetLeadingBit(trailing);
    int32x4_t shift = vsubq_s32(g_XMNumTrailing, leading);
    int32x4_t exponentSub = vsubq_s32(g_XMSubnormalExponent, shift);
    int32x4_t trailingSub = vshlq_s32(trailing, shift);
    trailingSub = vandq_s32(trailingSub, g_XMQNaNTest);
    int32x4_t e = vbslq_s32(isExponentZero, exponentSub, exponentNor);
    int32x4_t t = vbslq_s32(isExponentZero, trailingSub, trailingNor);

    // Compute the approximation.
    int32x4_t tmp = vorrq_s32(g_XMOne, t);
    float32x4_t y = vsubq_f32(vreinterpretq_f32_s32(tmp), g_XMOne);

    // Horner evaluation; vmlaq_f32(a, b, c) computes a + b * c.
    float32x4_t log2 = vmlaq_f32(g_XMLogEst6, g_XMLogEst7, y);
    log2 = vmlaq_f32(g_XMLogEst5, log2, y);
    log2 = vmlaq_f32(g_XMLogEst4, log2, y);
    log2 = vmlaq_f32(g_XMLogEst3, log2, y);
    log2 = vmlaq_f32(g_XMLogEst2, log2, y);
    log2 = vmlaq_f32(g_XMLogEst1, log2, y);
    log2 = vmlaq_f32(g_XMLogEst0, log2, y);
    log2 = vmlaq_f32(vcvtq_f32_s32(e), log2, y);

    // Change of base: scale the base-2 estimate (the variable keeps its name).
    log2 = vmulq_f32(g_XMInvLgE, log2);

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> ln(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    uint32x4_t isInfinite = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isInfinite = vceqq_u32(isInfinite, g_XMInfinity);

    // Signed integer compares on the raw bits.
    uint32x4_t isGreaterZero = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMZero);
    uint32x4_t isNotFinite = vcgtq_s32(vreinterpretq_s32_f32(V), g_XMInfinity);
    uint32x4_t isPositive = vbicq_u32(isGreaterZero, isNotFinite);

    uint32x4_t isZero = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    isZero = vceqq_u32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    uint32x4_t t0 = vandq_u32(vreinterpretq_u32_f32(V), g_XMQNaNTest);
    uint32x4_t t1 = vandq_u32(vreinterpretq_u32_f32(V), g_XMInfinity);
    t0 = vceqq_u32(t0, g_XMZero);
    t1 = vceqq_u32(t1, g_XMInfinity);
    uint32x4_t isNaN = vbicq_u32(t1, t0);

    float32x4_t result = vbslq_f32(isInfinite, g_XMInfinity, log2);
    float32x4_t tmp2 = vbslq_f32(isZero, g_XMNegInfinity, g_XMNegQNaN);
    result = vbslq_f32(isPositive, result, tmp2);
    result = vbslq_f32(isNaN, g_XMQNaN, result);
    return result;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_log_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Isolate the exponent field and the significand field of each lane.
    __m128i rawBiased = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    __m128i trailing = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i isExponentZero = _mm_cmpeq_epi32(g_XMZero, rawBiased);

    // Compute exponent and significand for normals.
    __m128i biased = _mm_srli_epi32(rawBiased, 23);
    __m128i exponentNor = _mm_sub_epi32(biased, g_XMExponentBias);
    __m128i trailingNor = trailing;

    // Compute exponent and significand for subnormals.
    __m128i leading = Internal::GetLeadingBit(trailing);
    __m128i shift = _mm_sub_epi32(g_XMNumTrailing, leading);
    __m128i exponentSub = _mm_sub_epi32(g_XMSubnormalExponent, shift);
    __m128i trailingSub = Internal::multi_sll_epi32(trailing, shift);
    trailingSub = _mm_and_si128(trailingSub, g_XMQNaNTest);

    // Bitwise select: subnormal values where the exponent field is zero.
    __m128i select0 = _mm_and_si128(isExponentZero, exponentSub);
    __m128i select1 = _mm_andnot_si128(isExponentZero, exponentNor);
    __m128i e = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isExponentZero, trailingSub);
    select1 = _mm_andnot_si128(isExponentZero, trailingNor);
    __m128i t = _mm_or_si128(select0, select1);

    // Compute the approximation.
    __m128i tmp = _mm_or_si128(g_XMOne, t);
    __m128 y = _mm_sub_ps(_mm_castsi128_ps(tmp), g_XMOne);

    // Horner evaluation, finishing with the integer exponent as constant term.
    __m128 log2 = XM_FMADD_PS(g_XMLogEst7, y, g_XMLogEst6);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst5);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst4);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst3);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst2);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst1);
    log2 = XM_FMADD_PS(log2, y, g_XMLogEst0);
    log2 = XM_FMADD_PS(log2, y, _mm_cvtepi32_ps(e));

    // Change of base: scale the base-2 estimate (the variable keeps its name).
    log2 = _mm_mul_ps(g_XMInvLgE, log2);

    //  if (V is NaN) -> QNaN
    //  else if (V is positive)
    //      if (V is infinite) -> +inf
    //      else -> ln(V)
    //  else
    //      if (V is zero) -> -inf
    //      else -> -QNaN

    __m128i isInfinite = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isInfinite = _mm_cmpeq_epi32(isInfinite, g_XMInfinity);

    // Signed integer compares on the raw bits.
    __m128i isGreaterZero = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMZero);
    __m128i isNotFinite = _mm_cmpgt_epi32(_mm_castps_si128(V), g_XMInfinity);
    __m128i isPositive = _mm_andnot_si128(isNotFinite, isGreaterZero);

    __m128i isZero = _mm_and_si128(_mm_castps_si128(V), g_XMAbsMask);
    isZero = _mm_cmpeq_epi32(isZero, g_XMZero);

    // NaN: exponent field all ones and significand field non-zero.
    __m128i t0 = _mm_and_si128(_mm_castps_si128(V), g_XMQNaNTest);
    __m128i t1 = _mm_and_si128(_mm_castps_si128(V), g_XMInfinity);
    t0 = _mm_cmpeq_epi32(t0, g_XMZero);
    t1 = _mm_cmpeq_epi32(t1, g_XMInfinity);
    __m128i isNaN = _mm_andnot_si128(t0, t1);

    // Apply the special cases in increasing priority; NaN wins last.
    select0 = _mm_and_si128(isInfinite, g_XMInfinity);
    select1 = _mm_andnot_si128(isInfinite, _mm_castps_si128(log2));
    __m128i result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isZero, g_XMNegInfinity);
    select1 = _mm_andnot_si128(isZero, g_XMNegQNaN);
    tmp = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isPositive, result);
    select1 = _mm_andnot_si128(isPositive, tmp);
    result = _mm_or_si128(select0, select1);

    select0 = _mm_and_si128(isNaN, g_XMQNaN);
    select1 = _mm_andnot_si128(isNaN, result);
    result = _mm_or_si128(select0, select1);

    return _mm_castsi128_ps(result);
#endif
}

//------------------------------------------------------------------------------

// Forwards to XMVectorLog2, so the result is the base-2 logarithm of each
// component of V.
inline XMVECTOR XM_CALLCONV XMVectorLog(FXMVECTOR V) noexcept
{
    const XMVECTOR Result = XMVectorLog2(V);
    return Result;
}

//------------------------------------------------------------------------------

// Raises each component of V1 to the power of the matching component of V2.
// Every path except SVML calls powf once per component; the SVML path forwards
// to _mm_pow_ps.
inline XMVECTOR XM_CALLCONV XMVectorPow
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float px = powf(V1.vector4_f32[0], V2.vector4_f32[0]);
    const float py = powf(V1.vector4_f32[1], V2.vector4_f32[1]);
    const float pz = powf(V1.vector4_f32[2], V2.vector4_f32[2]);
    const float pw = powf(V1.vector4_f32[3], V2.vector4_f32[3]);

    XMVECTORF32 Powers = { { { px, py, pz, pw } } };
    return Powers.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Extract each lane, run the scalar powf, and rebuild the vector.
    const float px = powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0));
    const float py = powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1));
    const float pz = powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2));
    const float pw = powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3));

    XMVECTORF32 Powers = { { { px, py, pz, pw } } };
    return Powers.v;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_pow_ps(V1, V2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Spill both operands to aligned scratch arrays for scalar access.
    XM_ALIGNED_DATA(16) float bases[4];
    XM_ALIGNED_DATA(16) float exponents[4];
    _mm_store_ps(bases, V1);
    _mm_store_ps(exponents, V2);

    const float px = powf(bases[0], exponents[0]);
    const float py = powf(bases[1], exponents[1]);
    const float pz = powf(bases[2], exponents[2]);
    const float pw = powf(bases[3], exponents[3]);
    return _mm_setr_ps(px, py, pz, pw);
#endif
}

//------------------------------------------------------------------------------

// Returns the absolute value of each component of V.
//
//  * _XM_NO_INTRINSICS_ : fabsf per component.
//  * NEON               : vabsq_f32.
//  * SSE                : bitwise AND with g_XMAbsMask (the mask this file also
//    uses to strip the sign before testing for zero / infinity).
//
// The SSE path masks the sign bit rather than computing max(0 - V, V): MAXPS
// returns its second operand when both inputs are zero or when either is NaN,
// so the max form yields -0.0 for an input of -0.0 and leaves the sign of a
// NaN untouched, unlike the other two paths.
inline XMVECTOR XM_CALLCONV XMVectorAbs(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
            fabsf(V.vector4_f32[0]),
            fabsf(V.vector4_f32[1]),
            fabsf(V.vector4_f32[2]),
            fabsf(V.vector4_f32[3])
        } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vabsq_f32(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Clear the sign bit of every lane.
    return _mm_and_ps(V, g_XMAbsMask);
#endif
}

//------------------------------------------------------------------------------

// Per-component floating-point remainder of V1 / V2, computed as
//     V1 - V2 * truncate(V1 / V2)
inline XMVECTOR XM_CALLCONV XMVectorMod
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Whole = XMVectorTruncate(XMVectorDivide(V1, V2));
    return XMVectorNegativeMultiplySubtract(V2, Whole, V1);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR vWhole = XMVectorTruncate(XMVectorDivide(V1, V2));
    // vmlsq_f32(a, b, c) computes a - b * c
    return vmlsq_f32(V1, vWhole, V2);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vWhole = XMVectorTruncate(_mm_div_ps(V1, V2));
    // V1 - vWhole * V2
    return XM_FNMADD_PS(vWhole, V2, V1);
#endif
}

//------------------------------------------------------------------------------

// Wraps each angle into the range -XM_PI <= angle < XM_PI by subtracting the
// nearest whole number of turns:
//     Angles - 2*pi * round(Angles / (2*pi))
inline XMVECTOR XM_CALLCONV XMVectorModAngles(FXMVECTOR Angles) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Number of whole turns closest to each angle.
    const XMVECTOR Turns = XMVectorRound(XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v));
    return XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, Turns, Angles);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Rounding goes through the shared helper because of its complexity.
    const XMVECTOR vTurns = XMVectorRound(vmulq_f32(Angles, g_XMReciprocalTwoPi));
    // vmlsq_f32(a, b, c) computes a - b * c
    return vmlsq_f32(Angles, vTurns, g_XMTwoPi);
#elif defined(_XM_SSE_INTRINSICS_)
    // Rounding goes through the shared helper because of its complexity.
    const XMVECTOR vTurns = XMVectorRound(_mm_mul_ps(Angles, g_XMReciprocalTwoPi));
    // Angles - vTurns * 2*pi
    return XM_FNMADD_PS(vTurns, g_XMTwoPi, Angles);
#endif
}

//------------------------------------------------------------------------------

// Computes the sine of each component of V.
//
// The scalar path calls sinf and the SVML path calls _mm_sin_ps. The NEON and
// SSE paths use an 11-degree minimax approximation: the angle is wrapped with
// XMVectorModAngles, reflected into [-pi/2, pi/2] (which preserves the sine),
// and then a polynomial in angle^2 is evaluated by Horner's scheme and
// multiplied by the angle.
inline XMVECTOR XM_CALLCONV XMVectorSin(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float sx = sinf(V.vector4_f32[0]);
    const float sy = sinf(V.vector4_f32[1]);
    const float sz = sinf(V.vector4_f32[2]);
    const float sw = sinf(V.vector4_f32[3]);

    XMVECTORF32 Sines = { { { sx, sy, sz, sw } } };
    return Sines.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Wrap the angle first.
    XMVECTOR angle = XMVectorModAngles(V);

    // Reflect about +/-pi/2 where |angle| > pi/2; sin(pi - a) == sin(a).
    const uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(angle), g_XMNegativeZero);
    const uint32x4_t signedPi = vorrq_u32(g_XMPi, signBit);  // pi with the sign of angle
    const float32x4_t magnitude = vabsq_f32(angle);
    const float32x4_t reflected = vsubq_f32(vreinterpretq_f32_u32(signedPi), angle);
    const uint32x4_t inRange = vcleq_f32(magnitude, g_XMHalfPi);
    angle = vbslq_f32(inRange, angle, reflected);

    const float32x4_t angleSq = vmulq_f32(angle, angle);

    // Horner evaluation in angleSq; vmlaq_f32(a, b, c) computes a + b * c.
    const XMVECTOR coeffs1 = g_XMSinCoefficients1;
    const XMVECTOR coeffs0 = g_XMSinCoefficients0;

    XMVECTOR term = vdupq_lane_f32(vget_high_f32(coeffs0), 1);
    XMVECTOR poly = vmlaq_lane_f32(term, angleSq, vget_low_f32(coeffs1), 0);

    term = vdupq_lane_f32(vget_high_f32(coeffs0), 0);
    poly = vmlaq_f32(term, poly, angleSq);

    term = vdupq_lane_f32(vget_low_f32(coeffs0), 1);
    poly = vmlaq_f32(term, poly, angleSq);

    term = vdupq_lane_f32(vget_low_f32(coeffs0), 0);
    poly = vmlaq_f32(term, poly, angleSq);

    poly = vmlaq_f32(g_XMOne, poly, angleSq);
    return vmulq_f32(poly, angle);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_sin_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Wrap the angle first.
    XMVECTOR angle = XMVectorModAngles(V);

    // Reflect about +/-pi/2 where |angle| > pi/2; sin(pi - a) == sin(a).
    const __m128 signBit = _mm_and_ps(angle, g_XMNegativeZero);
    const __m128 signedPi = _mm_or_ps(g_XMPi, signBit);  // pi with the sign of angle
    const __m128 magnitude = _mm_andnot_ps(signBit, angle);  // |angle|
    const __m128 reflected = _mm_sub_ps(signedPi, angle);
    const __m128 inRange = _mm_cmple_ps(magnitude, g_XMHalfPi);
    const __m128 keep = _mm_and_ps(inRange, angle);
    const __m128 swap = _mm_andnot_ps(inRange, reflected);
    angle = _mm_or_ps(keep, swap);

    const __m128 angleSq = _mm_mul_ps(angle, angle);

    // Horner evaluation in angleSq, broadcasting one coefficient per step.
    const XMVECTOR coeffs1 = g_XMSinCoefficients1;
    const XMVECTOR coeffs0 = g_XMSinCoefficients0;

    const __m128 lead = XM_PERMUTE_PS(coeffs1, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 poly = XM_FMADD_PS(lead, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(2, 2, 2, 2));
    poly = XM_FMADD_PS(poly, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(0, 0, 0, 0));
    poly = XM_FMADD_PS(poly, angleSq, term);

    poly = XM_FMADD_PS(poly, angleSq, g_XMOne);
    return _mm_mul_ps(poly, angle);
#endif
}

//------------------------------------------------------------------------------

// Computes the cosine of each component of V.
//
// The scalar path calls cosf and the SVML path calls _mm_cos_ps. The NEON and
// SSE paths use a 10-degree minimax approximation: the angle is wrapped with
// XMVectorModAngles and reflected into [-pi/2, pi/2]; because the reflection
// negates the cosine, a +1 / -1 factor is recorded per lane and applied to the
// polynomial (evaluated in angle^2 by Horner's scheme) at the end.
inline XMVECTOR XM_CALLCONV XMVectorCos(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float cx = cosf(V.vector4_f32[0]);
    const float cy = cosf(V.vector4_f32[1]);
    const float cz = cosf(V.vector4_f32[2]);
    const float cw = cosf(V.vector4_f32[3]);

    XMVECTORF32 Cosines = { { { cx, cy, cz, cw } } };
    return Cosines.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Wrap the angle first.
    XMVECTOR angle = XMVectorModAngles(V);

    // Reflect about +/-pi/2 where |angle| > pi/2; cos(pi - a) == -cos(a).
    const uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(angle), g_XMNegativeZero);
    const uint32x4_t signedPi = vorrq_u32(g_XMPi, signBit);  // pi with the sign of angle
    const float32x4_t magnitude = vabsq_f32(angle);
    const float32x4_t reflected = vsubq_f32(vreinterpretq_f32_u32(signedPi), angle);
    const uint32x4_t inRange = vcleq_f32(magnitude, g_XMHalfPi);
    angle = vbslq_f32(inRange, angle, reflected);
    const float32x4_t factor = vbslq_f32(inRange, g_XMOne, g_XMNegativeOne);

    const float32x4_t angleSq = vmulq_f32(angle, angle);

    // Horner evaluation in angleSq; vmlaq_f32(a, b, c) computes a + b * c.
    const XMVECTOR coeffs1 = g_XMCosCoefficients1;
    const XMVECTOR coeffs0 = g_XMCosCoefficients0;

    XMVECTOR term = vdupq_lane_f32(vget_high_f32(coeffs0), 1);
    XMVECTOR poly = vmlaq_lane_f32(term, angleSq, vget_low_f32(coeffs1), 0);

    term = vdupq_lane_f32(vget_high_f32(coeffs0), 0);
    poly = vmlaq_f32(term, poly, angleSq);

    term = vdupq_lane_f32(vget_low_f32(coeffs0), 1);
    poly = vmlaq_f32(term, poly, angleSq);

    term = vdupq_lane_f32(vget_low_f32(coeffs0), 0);
    poly = vmlaq_f32(term, poly, angleSq);

    poly = vmlaq_f32(g_XMOne, poly, angleSq);
    return vmulq_f32(poly, factor);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_cos_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Wrap the angle first.
    XMVECTOR angle = XMVectorModAngles(V);

    // Reflect about +/-pi/2 where |angle| > pi/2; cos(pi - a) == -cos(a).
    const XMVECTOR signBit = _mm_and_ps(angle, g_XMNegativeZero);
    const __m128 signedPi = _mm_or_ps(g_XMPi, signBit);  // pi with the sign of angle
    const __m128 magnitude = _mm_andnot_ps(signBit, angle);  // |angle|
    const __m128 reflected = _mm_sub_ps(signedPi, angle);
    const __m128 inRange = _mm_cmple_ps(magnitude, g_XMHalfPi);

    __m128 keep = _mm_and_ps(inRange, angle);
    __m128 swap = _mm_andnot_ps(inRange, reflected);
    angle = _mm_or_ps(keep, swap);

    // +1 where the angle was kept, -1 where it was reflected.
    keep = _mm_and_ps(inRange, g_XMOne);
    swap = _mm_andnot_ps(inRange, g_XMNegativeOne);
    const XMVECTOR factor = _mm_or_ps(keep, swap);

    const __m128 angleSq = _mm_mul_ps(angle, angle);

    // Horner evaluation in angleSq, broadcasting one coefficient per step.
    const XMVECTOR coeffs1 = g_XMCosCoefficients1;
    const XMVECTOR coeffs0 = g_XMCosCoefficients0;

    const __m128 lead = XM_PERMUTE_PS(coeffs1, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 poly = XM_FMADD_PS(lead, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(2, 2, 2, 2));
    poly = XM_FMADD_PS(poly, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angleSq, term);

    term = XM_PERMUTE_PS(coeffs0, _MM_SHUFFLE(0, 0, 0, 0));
    poly = XM_FMADD_PS(poly, angleSq, term);

    poly = XM_FMADD_PS(poly, angleSq, g_XMOne);
    return _mm_mul_ps(poly, factor);
#endif
}

//------------------------------------------------------------------------------


// Computes the sine and cosine of each component of V in one call.
//
//   pSin - receives the per-component sines; must not be null (asserted).
//   pCos - receives the per-component cosines; must not be null (asserted).
//   V    - angles in radians (the scalar path hands each component directly
//          to sinf/cosf).
//
// Exactly one of the implementations below is compiled, chosen by the
// _XM_*_INTRINSICS_ macros: scalar CRT calls, ARM NEON, SVML, or SSE.
inline void XM_CALLCONV XMVectorSinCos
(
    XMVECTOR* pSin,
    XMVECTOR* pCos,
    FXMVECTOR V
) noexcept
{
    assert(pSin != nullptr);
    assert(pCos != nullptr);

    // 11/10-degree minimax approximation
    // (NEON and SSE paths: sine is evaluated as an odd degree-11 polynomial and
    // cosine as an even degree-10 polynomial of the range-reduced angle.)

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Sin = { { {
            sinf(V.vector4_f32[0]),
            sinf(V.vector4_f32[1]),
            sinf(V.vector4_f32[2]),
            sinf(V.vector4_f32[3])
        } } };

    XMVECTORF32 Cos = { { {
            cosf(V.vector4_f32[0]),
            cosf(V.vector4_f32[1]),
            cosf(V.vector4_f32[2]),
            cosf(V.vector4_f32[3])
        } } };

    *pSin = Sin.v;
    *pCos = Cos.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), g_XMNegativeZero);
    uint32x4_t c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    float32x4_t absx = vabsq_f32(x);
    // rflx = (+/-pi) - x: the reflected angle used when |x| > pi/2
    float32x4_t  rflx = vsubq_f32(vreinterpretq_f32_u32(c), x);
    // comp is all-ones in lanes where x already lies in [-pi/2,pi/2]
    uint32x4_t comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32(comp, x, rflx);
    // Reflected lanes get -1 so the cosine polynomial can be sign-corrected at the end.
    float32x4_t fsign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);

    float32x4_t x2 = vmulq_f32(x, x);

    // Compute polynomial approximation for sine
    // Horner evaluation in x2, from the highest-order coefficient down:
    // SC1.x, SC0.w, SC0.z, SC0.y, SC0.x, then the constant 1.
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    const XMVECTOR SC0 = g_XMSinCoefficients0;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(SC1), 0);

    vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    // The final multiply by x turns the even polynomial in x2 into the odd sine polynomial.
    *pSin = vmulq_f32(Result, x);

    // Compute polynomial approximation for cosine
    // Same Horner scheme with CC1.x, CC0.w, CC0.z, CC0.y, CC0.x and the constant 1.
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    const XMVECTOR CC0 = g_XMCosCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
    Result = vmlaq_lane_f32(vConstants, x2, vget_low_f32(CC1), 0);

    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    // Apply the +1/-1 factor recorded during range reduction.
    *pCos = vmulq_f32(Result, fsign);
#elif defined(_XM_SVML_INTRINSICS_)
    // SVML returns the sine and writes the cosine through its pointer argument.
    *pSin = _mm_sincos_ps(pCos, V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
    // rflx = (+/-pi) - x: the reflected angle used when |x| > pi/2
    __m128 rflx = _mm_sub_ps(c, x);
    // comp is all-ones in lanes where x already lies in [-pi/2,pi/2]
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    // Bitwise select: x where comp is set, rflx elsewhere.
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    // Reuse 'sign' as the cosine factor: +1 for in-range lanes, -1 for reflected lanes.
    select0 = _mm_and_ps(comp, g_XMOne);
    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    sign = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation of sine
    // Horner evaluation in x2, from the highest-order coefficient down:
    // SC1.x, SC0.w, SC0.z, SC0.y, SC0.x, then the constant 1.
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    __m128 vConstantsB = XM_PERMUTE_PS(SC1, _MM_SHUFFLE(0, 0, 0, 0));
    const XMVECTOR SC0 = g_XMSinCoefficients0;
    __m128 vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(SC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    Result = XM_FMADD_PS(Result, x2, g_XMOne);
    // The final multiply by x turns the even polynomial in x2 into the odd sine polynomial.
    Result = _mm_mul_ps(Result, x);
    *pSin = Result;

    // Compute polynomial approximation of cosine
    // Same Horner scheme with CC1.x, CC0.w, CC0.z, CC0.y, CC0.x and the constant 1.
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    vConstantsB = XM_PERMUTE_PS(CC1, _MM_SHUFFLE(0, 0, 0, 0));
    const XMVECTOR CC0 = g_XMCosCoefficients0;
    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(3, 3, 3, 3));
    Result = XM_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(CC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    Result = XM_FMADD_PS(Result, x2, g_XMOne);
    // Apply the +1/-1 factor recorded during range reduction.
    Result = _mm_mul_ps(Result, sign);
    *pCos = Result;
#endif
}

//------------------------------------------------------------------------------

// Computes the tangent of each component of V (angles in radians; the scalar
// path passes each component directly to tanf).
//
// The NEON/SSE path reduces V by the nearest multiple of pi/2, evaluates a
// rational approximation N/D of the tangent of the remainder, and returns
// N/D when that multiple is even or -D/N when it is odd. Lanes where V
// compares equal to zero return zero.
inline XMVECTOR XM_CALLCONV XMVectorTan(FXMVECTOR V) noexcept
{
    // Cody and Waite algorithm to compute tangent.

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Result = { { {
            tanf(V.vector4_f32[0]),
            tanf(V.vector4_f32[1]),
            tanf(V.vector4_f32[2]),
            tanf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_tan_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // TanCoefficients0 = { T0, T1, T2, T3 } (denominator terms),
    // TanCoefficients1 = { T4, T5, T6, T7 } (T4 denominator, T5..T7 numerator).
    static const XMVECTORF32 TanCoefficients0 = { { { 1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f } } };
    static const XMVECTORF32 TanCoefficients1 = { { { 4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f } } };
    // x, y: the two constants subtracted (scaled by VA) during range reduction;
    // z: near-zero threshold; w: 2 / Pi.
    static const XMVECTORF32 TanConstants = { { { 1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ } } };
    static const XMVECTORU32 Mask = { { { 0x1, 0x1, 0x1, 0x1 } } };

    XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);

    XMVECTOR Zero = XMVectorZero();

    XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
    XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
    XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);

    // VA = round(V * 2/pi): the multiple of pi/2 nearest to V.
    XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);

    VA = XMVectorRound(VA);

    // Two-step reduction: VC = (V - VA*C0) - VA*C1
    // (presumably XMVectorNegativeMultiplySubtract(a, b, c) is c - a*b, per its name).
    XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);

    XMVECTOR VB = XMVectorAbs(VA);

    VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);

    // Convert |VA| to an integer bit pattern so its low bit (parity) can be
    // tested below. _XM_NO_INTRINSICS_ is handled by the first branch of this
    // function, so only the NEON and SSE conversions are reachable here.
#if defined(_XM_ARM_NEON_INTRINSICS_)
    VB = vreinterpretq_f32_u32(vcvtq_u32_f32(VB));
#else // _XM_SSE_INTRINSICS_
    // Bit-level reinterpretation through the cast intrinsic instead of a
    // pointer pun on VB's storage.
    VB = _mm_castsi128_ps(_mm_cvttps_epi32(VB));
#endif

    XMVECTOR VC2 = XMVectorMultiply(VC, VC);

    XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
    XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
    XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
    XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
    XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
    XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
    XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
    XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);

    // VBIsEven: all-ones in lanes where the low bit of the integer |VA| is clear.
    XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
    VBIsEven = XMVectorEqualInt(VBIsEven, Zero);

    // Interleaved Horner evaluation of the numerator N and denominator D:
    //   N = VC + VC * VC2 * (T5 + VC2 * (T6 + VC2 * T7))
    //   D = T0 + VC2 * (T1 + VC2 * (T2 + VC2 * (T3 + VC2 * T4)))
    XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
    XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
    N = XMVectorMultiplyAdd(VC2, N, T5);
    D = XMVectorMultiplyAdd(VC2, D, T2);
    N = XMVectorMultiply(VC2, N);
    D = XMVectorMultiplyAdd(VC2, D, T1);
    N = XMVectorMultiplyAdd(VC, N, VC);
    XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
    D = XMVectorMultiplyAdd(VC2, D, T0);

    // Where the remainder is within Epsilon, use N = VC and D = 1 instead.
    N = XMVectorSelect(N, VC, VCNearZero);
    D = XMVectorSelect(D, g_XMOne.v, VCNearZero);

    // R1 = N / D (even multiple), R0 = D / -N (odd multiple).
    XMVECTOR R0 = XMVectorNegate(N);
    XMVECTOR R1 = XMVectorDivide(N, D);
    R0 = XMVectorDivide(D, R0);

    XMVECTOR VIsZero = XMVectorEqual(V, Zero);

    XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);

    // Lanes whose input compares equal to zero return zero.
    Result = XMVectorSelect(Result, Zero, VIsZero);

    return Result;

#endif
}

//------------------------------------------------------------------------------

// Computes the hyperbolic sine of each component of V.
//
// The NEON and SSE paths return XMVectorExp(V/ln(2) - 1) - XMVectorExp(-V/ln(2) - 1)
// (XMVectorExp is presumably a base-2 exponential, given the 1/ln(2) scale).
inline XMVECTOR XM_CALLCONV XMVectorSinH(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: one sinhf call per lane.
    const float s0 = sinhf(V.vector4_f32[0]);
    const float s1 = sinhf(V.vector4_f32[1]);
    const float s2 = sinhf(V.vector4_f32[2]);
    const float s3 = sinhf(V.vector4_f32[3]);
    XMVECTORF32 Result = { { { s0, s1, s2, s3 } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 InvLn2 = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)

    // vmlaq: -1 + V * InvLn2;  vmlsq: -1 - V * InvLn2
    const XMVECTOR posArg = vmlaq_f32(g_XMNegativeOne.v, V, InvLn2.v);
    const XMVECTOR negArg = vmlsq_f32(g_XMNegativeOne.v, V, InvLn2.v);
    const XMVECTOR posExp = XMVectorExp(posArg);
    const XMVECTOR negExp = XMVectorExp(negArg);

    return vsubq_f32(posExp, negExp);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_sinh_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 InvLn2 = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)

    // FMADD: V * InvLn2 - 1;  FNMADD: -(V * InvLn2) - 1
    const XMVECTOR posArg = XM_FMADD_PS(V, InvLn2, g_XMNegativeOne);
    const XMVECTOR negArg = XM_FNMADD_PS(V, InvLn2, g_XMNegativeOne);
    const XMVECTOR posExp = XMVectorExp(posArg);
    const XMVECTOR negExp = XMVectorExp(negArg);

    return _mm_sub_ps(posExp, negExp);
#endif
}

//------------------------------------------------------------------------------

// Computes the hyperbolic cosine of each component of V.
//
// The NEON and SSE paths return XMVectorExp(V/ln(2) - 1) + XMVectorExp(-V/ln(2) - 1)
// (XMVectorExp is presumably a base-2 exponential, given the 1/ln(2) scale).
inline XMVECTOR XM_CALLCONV XMVectorCosH(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: one coshf call per lane.
    const float c0 = coshf(V.vector4_f32[0]);
    const float c1 = coshf(V.vector4_f32[1]);
    const float c2 = coshf(V.vector4_f32[2]);
    const float c3 = coshf(V.vector4_f32[3]);
    XMVECTORF32 Result = { { { c0, c1, c2, c3 } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 InvLn2 = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)

    // vmlaq: -1 + V * InvLn2;  vmlsq: -1 - V * InvLn2
    const XMVECTOR posArg = vmlaq_f32(g_XMNegativeOne.v, V, InvLn2.v);
    const XMVECTOR negArg = vmlsq_f32(g_XMNegativeOne.v, V, InvLn2.v);
    const XMVECTOR posExp = XMVectorExp(posArg);
    const XMVECTOR negExp = XMVectorExp(negArg);
    return vaddq_f32(posExp, negExp);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_cosh_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 InvLn2 = { { { 1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f } } }; // 1.0f / ln(2.0f)

    // FMADD: V * InvLn2 - 1;  FNMADD: -(V * InvLn2) - 1
    const XMVECTOR posArg = XM_FMADD_PS(V, InvLn2.v, g_XMNegativeOne.v);
    const XMVECTOR negArg = XM_FNMADD_PS(V, InvLn2.v, g_XMNegativeOne.v);
    const XMVECTOR posExp = XMVectorExp(posArg);
    const XMVECTOR negExp = XMVectorExp(negArg);
    return _mm_add_ps(posExp, negExp);
#endif
}

//------------------------------------------------------------------------------

// Computes the hyperbolic tangent of each component of V.
//
// The NEON and SSE paths return 1 - 1 / (0.5 * E + 0.5), where
// E = XMVectorExp(V * 2/ln(2)) (XMVectorExp is presumably a base-2
// exponential, given the 2/ln(2) scale).
inline XMVECTOR XM_CALLCONV XMVectorTanH(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: one tanhf call per lane.
    const float t0 = tanhf(V.vector4_f32[0]);
    const float t1 = tanhf(V.vector4_f32[1]);
    const float t2 = tanhf(V.vector4_f32[2]);
    const float t3 = tanhf(V.vector4_f32[3]);
    XMVECTORF32 Result = { { { t0, t1, t2, t3 } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 TwoInvLn2 = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f)

    const XMVECTOR scaled = vmulq_f32(V, TwoInvLn2.v);
    const XMVECTOR expTerm = XMVectorExp(scaled);
    // denom = 0.5 + expTerm * 0.5
    const XMVECTOR denom = vmlaq_f32(g_XMOneHalf.v, expTerm, g_XMOneHalf.v);
    const XMVECTOR invDenom = XMVectorReciprocal(denom);
    return vsubq_f32(g_XMOne.v, invDenom);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_tanh_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 TwoInvLn2 = { { { 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f } } }; // 2.0f / ln(2.0f)

    const XMVECTOR scaled = _mm_mul_ps(V, TwoInvLn2.v);
    const XMVECTOR expTerm = XMVectorExp(scaled);
    // denom = expTerm * 0.5 + 0.5
    const XMVECTOR denom = XM_FMADD_PS(expTerm, g_XMOneHalf.v, g_XMOneHalf.v);
    const XMVECTOR invDenom = _mm_div_ps(g_XMOne.v, denom);
    return _mm_sub_ps(g_XMOne.v, invDenom);
#endif
}

//------------------------------------------------------------------------------

// Computes the arcsine of each component of V and returns it as an XMVECTOR.
//
// The NEON and SSE paths evaluate t = sqrt(1 - |V|) * P(|V|), where P is the
// degree-7 polynomial whose eight coefficients come from
// g_XMArcCoefficients0/1, replace t with pi - t in lanes where V is negative,
// and return pi/2 - t.
inline XMVECTOR XM_CALLCONV XMVectorASin(FXMVECTOR V) noexcept
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Result = { { {
            asinf(V.vector4_f32[0]),
            asinf(V.vector4_f32[1]),
            asinf(V.vector4_f32[2]),
            asinf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // nonnegative is all-ones in lanes where V >= 0; x = |V|.
    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
    float32x4_t x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    float32x4_t root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation
    // Horner evaluation in x, from the highest-order coefficient down:
    // AC1.w, AC1.z, AC1.y, AC1.x, AC0.w, AC0.z, AC0.y, AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1);

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
    t0 = vmlaq_f32(vConstants, t0, x);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
    t0 = vmlaq_f32(vConstants, t0, x);
    t0 = vmulq_f32(t0, root);

    // Negative inputs use pi - t0; the arcsine is then pi/2 minus the selected value.
    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32(nonnegative, t0, t1);
    t0 = vsubq_f32(g_XMHalfPi, t0);
    return t0;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_asin_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // nonnegative is all-ones in lanes where V >= 0.
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation
    // Horner evaluation in x, from the highest-order coefficient down:
    // AC1.w, AC1.z, AC1.y, AC1.x, AC0.w, AC0.z, AC0.y, AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = XM_FMADD_PS(t0, x, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Bitwise select: t0 where V >= 0, pi - t0 elsewhere; then pi/2 minus that value.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    t0 = _mm_sub_ps(g_XMHalfPi, t0);
    return t0;
#endif
}

//------------------------------------------------------------------------------

// Computes the arccosine of each component of V and returns it as an XMVECTOR.
//
// The NEON and SSE paths evaluate t = sqrt(1 - |V|) * P(|V|), where P is the
// degree-7 polynomial whose eight coefficients come from
// g_XMArcCoefficients0/1, and return t in lanes where V >= 0 and pi - t
// elsewhere.
inline XMVECTOR XM_CALLCONV XMVectorACos(FXMVECTOR V) noexcept
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Result = { { {
            acosf(V.vector4_f32[0]),
            acosf(V.vector4_f32[1]),
            acosf(V.vector4_f32[2]),
            acosf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // nonnegative is all-ones in lanes where V >= 0; x = |V|.
    uint32x4_t nonnegative = vcgeq_f32(V, g_XMZero);
    float32x4_t x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    float32x4_t oneMValue = vsubq_f32(g_XMOne, x);
    float32x4_t clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    float32x4_t root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation
    // Horner evaluation in x, from the highest-order coefficient down:
    // AC1.w, AC1.z, AC1.y, AC1.x, AC0.w, AC0.z, AC0.y, AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
    XMVECTOR t0 = vmlaq_lane_f32(vConstants, x, vget_high_f32(AC1), 1);

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
    t0 = vmlaq_f32(vConstants, t0, x);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
    t0 = vmlaq_f32(vConstants, t0, x);

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
    t0 = vmlaq_f32(vConstants, t0, x);
    t0 = vmulq_f32(t0, root);

    // Negative inputs use pi - t0.
    float32x4_t t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32(nonnegative, t0, t1);
    return t0;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_acos_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // nonnegative is all-ones in lanes where V >= 0.
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation
    // Horner evaluation in x, from the highest-order coefficient down:
    // AC1.w, AC1.z, AC1.y, AC1.x, AC0.w, AC0.z, AC0.y, AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    __m128 vConstantsB = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 t0 = XM_FMADD_PS(vConstantsB, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC1, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(3, 3, 3, 3));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(2, 2, 2, 2));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(1, 1, 1, 1));
    t0 = XM_FMADD_PS(t0, x, vConstants);

    vConstants = XM_PERMUTE_PS(AC0, _MM_SHUFFLE(0, 0, 0, 0));
    t0 = XM_FMADD_PS(t0, x, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Bitwise select: t0 where V >= 0, pi - t0 elsewhere.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
#endif
}

//------------------------------------------------------------------------------

// Computes the arctangent of each component of V and returns it as an XMVECTOR.
//
// The NEON and SSE paths evaluate an odd degree-17 polynomial P at x, where
// x = V when |V| <= 1 and x = 1/V otherwise. Lanes with |V| <= 1 return P(x);
// the remaining lanes return sign * pi/2 - P(x), with sign = +1 when V > 1
// and -1 otherwise.
inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept
{
    // 17-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Result = { { {
            atanf(V.vector4_f32[0]),
            atanf(V.vector4_f32[1]),
            atanf(V.vector4_f32[2]),
            atanf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t absV = vabsq_f32(V);
    float32x4_t invV = XMVectorReciprocal(V);
    // sign starts as +1 where V > 1 and -1 everywhere else...
    uint32x4_t comp = vcgtq_f32(V, g_XMOne);
    float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
    // ...then is forced to 0 where |V| <= 1; those lanes keep V, the others use 1/V.
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign);
    float32x4_t x = vbslq_f32(comp, V, invV);

    float32x4_t x2 = vmulq_f32(x, x);

    // Compute polynomial approximation
    // Horner evaluation in x2, from the highest-order coefficient down:
    // TC1.w, TC1.z, TC1.y, TC1.x, TC0.w, TC0.z, TC0.y, TC0.x, then the constant 1.
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1);

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    // The final multiply by x makes the polynomial odd.
    Result = vmulq_f32(Result, x);

    // result1 = sign * pi/2 - P(x), used for lanes that were inverted.
    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    // sign == 0 marks lanes with |V| <= 1, which keep the polynomial value.
    comp = vceqq_f32(sign, g_XMZero);
    Result = vbslq_f32(comp, Result, result1);
    return Result;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_atan_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV = XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V);
    // sign starts as +1 where V > 1 and -1 everywhere else...
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    // ...then is forced to 0 where |V| <= 1.
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    // x = V where |V| <= 1, otherwise 1/V.
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation
    // Horner evaluation in x2, from the highest-order coefficient down:
    // TC1.w, TC1.z, TC1.y, TC1.x, TC0.w, TC0.z, TC0.y, TC0.x, then the constant 1.
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    Result = XM_FMADD_PS(Result, x2, g_XMOne);

    // The final multiply by x makes the polynomial odd.
    Result = _mm_mul_ps(Result, x);
    // result1 = sign * pi/2 - P(x), used for lanes that were inverted.
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    // sign == 0 marks lanes with |V| <= 1, which keep the polynomial value.
    comp = _mm_cmpeq_ps(sign, g_XMZero);
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Computes the per-component two-argument arctangent of Y and X.
//
//   Y - numerators (the "y" coordinate of each lane).
//   X - denominators (the "x" coordinate of each lane).
//
// The scalar path calls atan2f per lane and the SVML path calls _mm_atan2_ps.
// The generic path first builds the special-case answers listed below with
// selects, then uses XMVectorATan(Y / X) plus a quadrant offset for every lane
// that did not hit a special case.
inline XMVECTOR XM_CALLCONV XMVectorATan2
(
    FXMVECTOR Y,
    FXMVECTOR X
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: defer to the C runtime for each of the four lanes.
    XMVECTORF32 Result = { { {
            atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
            atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
            atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
            atan2f(Y.vector4_f32[3], X.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_atan2_ps(Y, X);
    return Result;
#else

    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:

    //     Y == 0 and X is Negative         -> Pi with the sign of Y
    //     Y == 0 and X is Positive         -> 0 with the sign of Y
    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
    //     Y != 0 and X is Negative         -> atan(Y/X) + (Pi with the sign of Y)
    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
    //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
    //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
    //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y

    // x = Pi, y = Pi / 2, z = Pi / 4, w = 3Pi / 4
    static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f } } };

    XMVECTOR Zero = XMVectorZero();
    // Sentinel bit pattern: lanes of Result that still hold this value after the
    // special-case selects need the general atan computation.
    XMVECTOR ATanResultValid = XMVectorTrueInt();

    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);

    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
    // XIsPositive is set where the sign bit of X is clear.
    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);

    // Copy the sign bit of Y onto each constant.
    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
    Pi = XMVectorOrInt(Pi, YSign);
    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);

    // R1: answer when Y == 0 (signed zero for positive X, signed Pi otherwise).
    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
    // R2: signed Pi / 2 when X == 0, otherwise the "not a special case" sentinel.
    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
    // R4/R5: answers when Y is infinite (Pi / 4 or 3Pi / 4 if X is infinite too, else Pi / 2).
    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
    // Turn the sentinel into a per-lane mask of lanes that need the general computation.
    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

    XMVECTOR V = XMVectorDivide(Y, X);

    XMVECTOR R0 = XMVectorATan(V);

    // Quadrant offset: negative zero for positive X, signed Pi otherwise.
    R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive);
    R2 = XMVectorAdd(R0, R1);

    return XMVectorSelect(Result, R2, ATanResultValid);

#endif
}

//------------------------------------------------------------------------------

// Estimates the sine of each component of V (angles in radians; the scalar
// path passes each component directly to sinf).
//
// The NEON and SSE paths fold the angle into [-pi/2,pi/2] and evaluate a
// three-coefficient polynomial taken from the y, z and w lanes of
// g_XMSinCoefficients1.
inline XMVECTOR XM_CALLCONV XMVectorSinEst(FXMVECTOR V) noexcept
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: one sinf call per lane.
    const float s0 = sinf(V.vector4_f32[0]);
    const float s1 = sinf(V.vector4_f32[1]);
    const float s2 = sinf(V.vector4_f32[2]);
    const float s3 = sinf(V.vector4_f32[3]);
    XMVECTORF32 Result = { { { s0, s1, s2, s3 } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2]; the reflected angle has the same sine.
    const uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(angle), g_XMNegativeZero);
    const uint32x4_t signedPi = vorrq_u32(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    const float32x4_t magnitude = vabsq_f32(angle);
    const float32x4_t reflected = vsubq_f32(vreinterpretq_f32_u32(signedPi), angle);
    const uint32x4_t inRange = vcleq_f32(magnitude, g_XMHalfPi);
    angle = vbslq_f32(inRange, angle, reflected);

    const float32x4_t angle2 = vmulq_f32(angle, angle);

    // Horner evaluation in angle2: coefficients w, z, y, then the constant 1.
    const XMVECTOR coeffs = g_XMSinCoefficients1;
    const XMVECTOR coeffZ = vdupq_lane_f32(vget_high_f32(coeffs), 0);
    XMVECTOR poly = vmlaq_lane_f32(coeffZ, angle2, vget_high_f32(coeffs), 1);

    const XMVECTOR coeffY = vdupq_lane_f32(vget_low_f32(coeffs), 1);
    poly = vmlaq_f32(coeffY, poly, angle2);

    poly = vmlaq_f32(g_XMOne, poly, angle2);
    return vmulq_f32(poly, angle);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_sin_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2]; the reflected angle has the same sine.
    const __m128 signBit = _mm_and_ps(angle, g_XMNegativeZero);
    const __m128 signedPi = _mm_or_ps(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    const __m128 magnitude = _mm_andnot_ps(signBit, angle);  // |angle|
    const __m128 reflected = _mm_sub_ps(signedPi, angle);
    const __m128 inRange = _mm_cmple_ps(magnitude, g_XMHalfPi);
    const __m128 keepPart = _mm_and_ps(inRange, angle);
    const __m128 reflectPart = _mm_andnot_ps(inRange, reflected);
    angle = _mm_or_ps(keepPart, reflectPart);

    const __m128 angle2 = _mm_mul_ps(angle, angle);

    // Horner evaluation in angle2: coefficients w, z, y, then the constant 1.
    const XMVECTOR coeffs = g_XMSinCoefficients1;
    const __m128 coeffW = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 coeffZ = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 poly = XM_FMADD_PS(coeffW, angle2, coeffZ);

    const __m128 coeffY = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angle2, coeffY);
    poly = XM_FMADD_PS(poly, angle2, g_XMOne);
    return _mm_mul_ps(poly, angle);
#endif
}

//------------------------------------------------------------------------------

// Estimates the cosine of each component of V (angles in radians; the scalar
// path passes each component directly to cosf).
//
// The NEON and SSE paths fold the angle into [-pi/2,pi/2], evaluate a
// three-coefficient polynomial taken from the y, z and w lanes of
// g_XMCosCoefficients1, and negate the result in lanes that were reflected.
inline XMVECTOR XM_CALLCONV XMVectorCosEst(FXMVECTOR V) noexcept
{
    // 6-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: one cosf call per lane.
    const float c0 = cosf(V.vector4_f32[0]);
    const float c1 = cosf(V.vector4_f32[1]);
    const float c2 = cosf(V.vector4_f32[2]);
    const float c3 = cosf(V.vector4_f32[3]);
    XMVECTORF32 Result = { { { c0, c1, c2, c3 } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Map V to an angle in [-pi,pi].
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2]; the reflected angle has the opposite cosine.
    const uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(angle), g_XMNegativeZero);
    const uint32x4_t signedPi = vorrq_u32(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    const float32x4_t magnitude = vabsq_f32(angle);
    const float32x4_t reflected = vsubq_f32(vreinterpretq_f32_u32(signedPi), angle);
    const uint32x4_t inRange = vcleq_f32(magnitude, g_XMHalfPi);
    angle = vbslq_f32(inRange, angle, reflected);
    // +1 for lanes kept as-is, -1 for reflected lanes.
    const float32x4_t cosSign = vbslq_f32(inRange, g_XMOne, g_XMNegativeOne);

    const float32x4_t angle2 = vmulq_f32(angle, angle);

    // Horner evaluation in angle2: coefficients w, z, y, then the constant 1.
    const XMVECTOR coeffs = g_XMCosCoefficients1;
    const XMVECTOR coeffZ = vdupq_lane_f32(vget_high_f32(coeffs), 0);
    XMVECTOR poly = vmlaq_lane_f32(coeffZ, angle2, vget_high_f32(coeffs), 1);

    const XMVECTOR coeffY = vdupq_lane_f32(vget_low_f32(coeffs), 1);
    poly = vmlaq_f32(coeffY, poly, angle2);

    poly = vmlaq_f32(g_XMOne, poly, angle2);
    return vmulq_f32(poly, cosSign);
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_cos_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Map V to an angle in [-pi,pi].
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2]; the reflected angle has the opposite cosine.
    const XMVECTOR signBit = _mm_and_ps(angle, g_XMNegativeZero);
    const __m128 signedPi = _mm_or_ps(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    const __m128 magnitude = _mm_andnot_ps(signBit, angle);  // |angle|
    const __m128 reflected = _mm_sub_ps(signedPi, angle);
    const __m128 inRange = _mm_cmple_ps(magnitude, g_XMHalfPi);
    const __m128 keepPart = _mm_and_ps(inRange, angle);
    const __m128 reflectPart = _mm_andnot_ps(inRange, reflected);
    angle = _mm_or_ps(keepPart, reflectPart);
    // +1 for lanes kept as-is, -1 for reflected lanes.
    const __m128 plusPart = _mm_and_ps(inRange, g_XMOne);
    const __m128 minusPart = _mm_andnot_ps(inRange, g_XMNegativeOne);
    const XMVECTOR cosSign = _mm_or_ps(plusPart, minusPart);

    const __m128 angle2 = _mm_mul_ps(angle, angle);

    // Horner evaluation in angle2: coefficients w, z, y, then the constant 1.
    const XMVECTOR coeffs = g_XMCosCoefficients1;
    const __m128 coeffW = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 coeffZ = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 poly = XM_FMADD_PS(coeffW, angle2, coeffZ);

    const __m128 coeffY = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angle2, coeffY);
    poly = XM_FMADD_PS(poly, angle2, g_XMOne);
    return _mm_mul_ps(poly, cosSign);
#endif
}

//------------------------------------------------------------------------------


inline void XM_CALLCONV XMVectorSinCosEst
(
    XMVECTOR* pSin,
    XMVECTOR* pCos,
    FXMVECTOR  V
) noexcept
{
    // Estimates sine and cosine of every component of V in one pass.
    // The sine is stored through pSin first, then the cosine through pCos;
    // both pointers must be non-null.
    assert(pSin != nullptr);
    assert(pCos != nullptr);

    // 7/6-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: evaluate each lane with the C runtime.
    XMVECTORF32 sinLanes = { { {
            sinf(V.vector4_f32[0]),
            sinf(V.vector4_f32[1]),
            sinf(V.vector4_f32[2]),
            sinf(V.vector4_f32[3])
        } } };

    XMVECTORF32 cosLanes = { { {
            cosf(V.vector4_f32[0]),
            cosf(V.vector4_f32[1]),
            cosf(V.vector4_f32[2]),
            cosf(V.vector4_f32[3])
        } } };

    *pSin = sinLanes.v;
    *pCos = cosLanes.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Wrap the angle into the bounds of pi.
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2]: lanes with |angle| > pi/2 are replaced by
    // (+/-pi - angle) and their cosine gets a -1 multiplier.
    uint32x4_t signBit = vandq_u32(vreinterpretq_u32_f32(angle), g_XMNegativeZero);
    uint32x4_t signedPi = vorrq_u32(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    float32x4_t magnitude = vabsq_f32(angle);
    float32x4_t reflected = vsubq_f32(vreinterpretq_f32_u32(signedPi), angle);
    uint32x4_t keep = vcleq_f32(magnitude, g_XMHalfPi);
    angle = vbslq_f32(keep, angle, reflected);
    float32x4_t cosSign = vbslq_f32(keep, g_XMOne, g_XMNegativeOne);

    float32x4_t angleSq = vmulq_f32(angle, angle);

    // Sine: Horner evaluation in angle^2, finished by a multiply with angle.
    const XMVECTOR sinCoeffs = g_XMSinCoefficients1;
    XMVECTOR term = vdupq_lane_f32(vget_high_f32(sinCoeffs), 0);
    XMVECTOR poly = vmlaq_lane_f32(term, angleSq, vget_high_f32(sinCoeffs), 1);

    term = vdupq_lane_f32(vget_low_f32(sinCoeffs), 1);
    poly = vmlaq_f32(term, poly, angleSq);

    poly = vmlaq_f32(g_XMOne, poly, angleSq);
    *pSin = vmulq_f32(poly, angle);

    // Cosine: same scheme with the cosine coefficients, finished by the sign.
    const XMVECTOR cosCoeffs = g_XMCosCoefficients1;
    term = vdupq_lane_f32(vget_high_f32(cosCoeffs), 0);
    poly = vmlaq_lane_f32(term, angleSq, vget_high_f32(cosCoeffs), 1);

    term = vdupq_lane_f32(vget_low_f32(cosCoeffs), 1);
    poly = vmlaq_f32(term, poly, angleSq);

    poly = vmlaq_f32(g_XMOne, poly, angleSq);
    *pCos = vmulq_f32(poly, cosSign);
#elif defined(_XM_SSE_INTRINSICS_)
    // Wrap the angle into the bounds of pi.
    XMVECTOR angle = XMVectorModAngles(V);

    // Fold into [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    XMVECTOR signBit = _mm_and_ps(angle, g_XMNegativeZero);
    __m128 signedPi = _mm_or_ps(g_XMPi, signBit);  // pi when angle >= 0, -pi when angle < 0
    __m128 magnitude = _mm_andnot_ps(signBit, angle);  // |angle|
    __m128 reflected = _mm_sub_ps(signedPi, angle);
    __m128 keep = _mm_cmple_ps(magnitude, g_XMHalfPi);
    // Bitwise blends: take the first operand where keep is set, the second elsewhere.
    angle = _mm_or_ps(_mm_and_ps(keep, angle), _mm_andnot_ps(keep, reflected));
    __m128 cosSign = _mm_or_ps(_mm_and_ps(keep, g_XMOne), _mm_andnot_ps(keep, g_XMNegativeOne));

    __m128 angleSq = _mm_mul_ps(angle, angle);

    // Sine: Horner evaluation in angle^2, finished by a multiply with angle.
    const XMVECTOR sinCoeffs = g_XMSinCoefficients1;
    __m128 lead = XM_PERMUTE_PS(sinCoeffs, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 term = XM_PERMUTE_PS(sinCoeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 poly = XM_FMADD_PS(lead, angleSq, term);

    term = XM_PERMUTE_PS(sinCoeffs, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angleSq, term);
    poly = XM_FMADD_PS(poly, angleSq, g_XMOne);
    poly = _mm_mul_ps(poly, angle);
    *pSin = poly;

    // Cosine: same scheme with the cosine coefficients, finished by the sign.
    const XMVECTOR cosCoeffs = g_XMCosCoefficients1;
    lead = XM_PERMUTE_PS(cosCoeffs, _MM_SHUFFLE(3, 3, 3, 3));
    term = XM_PERMUTE_PS(cosCoeffs, _MM_SHUFFLE(2, 2, 2, 2));
    poly = XM_FMADD_PS(lead, angleSq, term);

    term = XM_PERMUTE_PS(cosCoeffs, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, angleSq, term);
    poly = XM_FMADD_PS(poly, angleSq, g_XMOne);
    poly = _mm_mul_ps(poly, cosSign);
    *pCos = poly;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorTanEst(FXMVECTOR V) noexcept
{
    // Estimates the tangent of every component of V.
#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: evaluate each lane with the C runtime.
    XMVECTORF32 tanLanes = { { {
            tanf(V.vector4_f32[0]),
            tanf(V.vector4_f32[1]),
            tanf(V.vector4_f32[2]),
            tanf(V.vector4_f32[3])
        } } };
    return tanLanes.v;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_tan_ps(V);
#else

    // x, y, z of g_XMTanEstCoefficients feed the rational approximation below;
    // w is the 1/pi scale used for the range reduction.
    const XMVECTOR coeffs = g_XMTanEstCoefficients.v;
    const XMVECTOR recipPi = XMVectorSplatW(coeffs);

    // Rounded multiple of pi contained in V ...
    XMVECTOR reduced = XMVectorMultiply(V, recipPi);
    reduced = XMVectorRound(reduced);

    // ... which is then removed from V (multiply by pi, subtract from V).
    reduced = XMVectorNegativeMultiplySubtract(g_XMPi.v, reduced, V);

    const XMVECTOR c0 = XMVectorSplatX(coeffs);
    const XMVECTOR c1 = XMVectorSplatY(coeffs);
    const XMVECTOR c2 = XMVectorSplatZ(coeffs);

    // Denominator is built from reduced^2 and c2; numerator from
    // reduced^2 * (reduced * c1) + reduced * c0.
    const XMVECTOR denominator = XMVectorNegativeMultiplySubtract(reduced, reduced, c2);
    const XMVECTOR reducedSq = XMVectorMultiply(reduced, reduced);
    const XMVECTOR linearTerm = XMVectorMultiply(reduced, c0);
    const XMVECTOR cubicSeed = XMVectorMultiply(reduced, c1);

    const XMVECTOR recipDenominator = XMVectorReciprocalEst(denominator);
    const XMVECTOR numerator = XMVectorMultiplyAdd(reducedSq, cubicSeed, linearTerm);

    return XMVectorMultiply(numerator, recipDenominator);

#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorASinEst(FXMVECTOR V) noexcept
{
    // Estimates the arcsine of every component of V.
    // The SIMD paths use a 3-degree minimax approximation.

#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: evaluate each lane with the C runtime.
    XMVECTORF32 asinLanes = { { {
            asinf(V.vector4_f32[0]),
            asinf(V.vector4_f32[1]),
            asinf(V.vector4_f32[2]),
            asinf(V.vector4_f32[3])
        } } };
    return asinLanes.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t isNonNegative = vcgeq_f32(V, g_XMZero);
    float32x4_t magnitude = vabsq_f32(V);

    // sqrt(1-|V|); the difference is clamped at zero so the root never sees a negative.
    float32x4_t oneMinusMag = vsubq_f32(g_XMOne, magnitude);
    float32x4_t clamped = vmaxq_f32(g_XMZero, oneMinusMag);
    float32x4_t root = XMVectorSqrt(clamped);

    // Cubic in |V| (Horner form), scaled by the root.
    const XMVECTOR coeffs = g_XMArcEstCoefficients;
    XMVECTOR term = vdupq_lane_f32(vget_high_f32(coeffs), 0);
    XMVECTOR acc = vmlaq_lane_f32(term, magnitude, vget_high_f32(coeffs), 1);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 1);
    acc = vmlaq_f32(term, acc, magnitude);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 0);
    acc = vmlaq_f32(term, acc, magnitude);
    acc = vmulq_f32(acc, root);

    // Negative inputs use (pi - acc); the result is then pi/2 minus that value.
    float32x4_t mirrored = vsubq_f32(g_XMPi, acc);
    acc = vbslq_f32(isNonNegative, acc, mirrored);
    acc = vsubq_f32(g_XMHalfPi, acc);
    return acc;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_asin_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 isNonNegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 negated = _mm_sub_ps(g_XMZero, V);
    __m128 magnitude = _mm_max_ps(V, negated);  // |V|

    // sqrt(1-|V|); the difference is clamped at zero so the root never sees a negative.
    __m128 oneMinusMag = _mm_sub_ps(g_XMOne, magnitude);
    __m128 clamped = _mm_max_ps(g_XMZero, oneMinusMag);
    __m128 root = _mm_sqrt_ps(clamped);

    // Cubic in |V| (Horner form), scaled by the root.
    const XMVECTOR coeffs = g_XMArcEstCoefficients;
    __m128 lead = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 acc = XM_FMADD_PS(lead, magnitude, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(1, 1, 1, 1));
    acc = XM_FMADD_PS(acc, magnitude, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(0, 0, 0, 0));
    acc = XM_FMADD_PS(acc, magnitude, term);
    acc = _mm_mul_ps(acc, root);

    // Negative inputs use (pi - acc); the result is then pi/2 minus that value.
    __m128 mirrored = _mm_sub_ps(g_XMPi, acc);
    acc = _mm_or_ps(_mm_and_ps(isNonNegative, acc), _mm_andnot_ps(isNonNegative, mirrored));
    acc = _mm_sub_ps(g_XMHalfPi, acc);
    return acc;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorACosEst(FXMVECTOR V) noexcept
{
    // Estimates the arccosine of every component of V.
    // The SIMD paths use a 3-degree minimax approximation.

#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: evaluate each lane with the C runtime.
    XMVECTORF32 acosLanes = { { {
            acosf(V.vector4_f32[0]),
            acosf(V.vector4_f32[1]),
            acosf(V.vector4_f32[2]),
            acosf(V.vector4_f32[3])
        } } };
    return acosLanes.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    uint32x4_t isNonNegative = vcgeq_f32(V, g_XMZero);
    float32x4_t magnitude = vabsq_f32(V);

    // sqrt(1-|V|); the difference is clamped at zero so the root never sees a negative.
    float32x4_t oneMinusMag = vsubq_f32(g_XMOne, magnitude);
    float32x4_t clamped = vmaxq_f32(g_XMZero, oneMinusMag);
    float32x4_t root = XMVectorSqrt(clamped);

    // Cubic in |V| (Horner form), scaled by the root.
    const XMVECTOR coeffs = g_XMArcEstCoefficients;
    XMVECTOR term = vdupq_lane_f32(vget_high_f32(coeffs), 0);
    XMVECTOR acc = vmlaq_lane_f32(term, magnitude, vget_high_f32(coeffs), 1);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 1);
    acc = vmlaq_f32(term, acc, magnitude);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 0);
    acc = vmlaq_f32(term, acc, magnitude);
    acc = vmulq_f32(acc, root);

    // Non-negative inputs return acc directly; negative inputs return (pi - acc).
    float32x4_t mirrored = vsubq_f32(g_XMPi, acc);
    acc = vbslq_f32(isNonNegative, acc, mirrored);
    return acc;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_acos_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 isNonNegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 negated = _mm_sub_ps(g_XMZero, V);
    __m128 magnitude = _mm_max_ps(V, negated);  // |V|

    // sqrt(1-|V|); the difference is clamped at zero so the root never sees a negative.
    __m128 oneMinusMag = _mm_sub_ps(g_XMOne, magnitude);
    __m128 clamped = _mm_max_ps(g_XMZero, oneMinusMag);
    __m128 root = _mm_sqrt_ps(clamped);

    // Cubic in |V| (Horner form), scaled by the root.
    const XMVECTOR coeffs = g_XMArcEstCoefficients;
    __m128 lead = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 acc = XM_FMADD_PS(lead, magnitude, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(1, 1, 1, 1));
    acc = XM_FMADD_PS(acc, magnitude, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(0, 0, 0, 0));
    acc = XM_FMADD_PS(acc, magnitude, term);
    acc = _mm_mul_ps(acc, root);

    // Non-negative inputs return acc directly; negative inputs return (pi - acc).
    __m128 mirrored = _mm_sub_ps(g_XMPi, acc);
    acc = _mm_or_ps(_mm_and_ps(isNonNegative, acc), _mm_andnot_ps(isNonNegative, mirrored));
    return acc;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorATanEst(FXMVECTOR V) noexcept
{
    // Estimates the arctangent of every component of V.
    // The SIMD paths use a 9-degree minimax approximation.

#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: evaluate each lane with the C runtime.
    XMVECTORF32 atanLanes = { { {
            atanf(V.vector4_f32[0]),
            atanf(V.vector4_f32[1]),
            atanf(V.vector4_f32[2]),
            atanf(V.vector4_f32[3])
        } } };
    return atanLanes.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t magnitude = vabsq_f32(V);
    float32x4_t reciprocal = XMVectorReciprocalEst(V);

    // region: 0 where |V| <= 1, otherwise +1 where V > 1 and -1 elsewhere.
    uint32x4_t mask = vcgtq_f32(V, g_XMOne);
    float32x4_t region = vbslq_f32(mask, g_XMOne, g_XMNegativeOne);
    mask = vcleq_f32(magnitude, g_XMOne);
    region = vbslq_f32(mask, g_XMZero, region);
    // The polynomial argument is V where |V| <= 1 and the reciprocal otherwise.
    float32x4_t arg = vbslq_f32(mask, V, reciprocal);

    float32x4_t argSq = vmulq_f32(arg, arg);

    // Horner evaluation in arg^2, finished by a multiply with arg.
    const XMVECTOR coeffs = g_XMATanEstCoefficients1;
    XMVECTOR term = vdupq_lane_f32(vget_high_f32(coeffs), 0);
    XMVECTOR poly = vmlaq_lane_f32(term, argSq, vget_high_f32(coeffs), 1);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 1);
    poly = vmlaq_f32(term, poly, argSq);

    term = vdupq_lane_f32(vget_low_f32(coeffs), 0);
    poly = vmlaq_f32(term, poly, argSq);

    // ATanEstCoefficients0 is already splatted
    poly = vmlaq_f32(g_XMATanEstCoefficients0, poly, argSq);
    poly = vmulq_f32(poly, arg);

    // Lanes outside [-1,1] use region * pi/2 - poly.
    float32x4_t outside = vmulq_f32(region, g_XMHalfPi);
    outside = vsubq_f32(outside, poly);

    mask = vceqq_f32(region, g_XMZero);
    poly = vbslq_f32(mask, poly, outside);
    return poly;
#elif defined(_XM_SVML_INTRINSICS_)
    return _mm_atan_ps(V);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 magnitude = XMVectorAbs(V);
    __m128 reciprocal = _mm_div_ps(g_XMOne, V);

    // region: 0 where |V| <= 1, otherwise +1 where V > 1 and -1 elsewhere.
    __m128 mask = _mm_cmpgt_ps(V, g_XMOne);
    __m128 region = _mm_or_ps(_mm_and_ps(mask, g_XMOne), _mm_andnot_ps(mask, g_XMNegativeOne));
    mask = _mm_cmple_ps(magnitude, g_XMOne);
    region = _mm_or_ps(_mm_and_ps(mask, g_XMZero), _mm_andnot_ps(mask, region));
    // The polynomial argument is V where |V| <= 1 and the reciprocal otherwise.
    __m128 arg = _mm_or_ps(_mm_and_ps(mask, V), _mm_andnot_ps(mask, reciprocal));

    __m128 argSq = _mm_mul_ps(arg, arg);

    // Horner evaluation in arg^2, finished by a multiply with arg.
    const XMVECTOR coeffs = g_XMATanEstCoefficients1;
    __m128 lead = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 poly = XM_FMADD_PS(lead, argSq, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(1, 1, 1, 1));
    poly = XM_FMADD_PS(poly, argSq, term);

    term = XM_PERMUTE_PS(coeffs, _MM_SHUFFLE(0, 0, 0, 0));
    poly = XM_FMADD_PS(poly, argSq, term);
    // ATanEstCoefficients0 is already splatted
    poly = XM_FMADD_PS(poly, argSq, g_XMATanEstCoefficients0);
    poly = _mm_mul_ps(poly, arg);

    // Lanes outside [-1,1] use region * pi/2 - poly.
    __m128 outside = _mm_mul_ps(region, g_XMHalfPi);
    outside = _mm_sub_ps(outside, poly);

    mask = _mm_cmpeq_ps(region, g_XMZero);
    poly = _mm_or_ps(_mm_and_ps(mask, poly), _mm_andnot_ps(mask, outside));
    return poly;
#endif
}

//------------------------------------------------------------------------------

// Estimates atan2(Y, X) for every component pair.
// The scalar build calls atan2f per lane and the SVML build calls
// _mm_atan2_ps; the generic path below resolves the zero/infinity special
// cases with select masks and uses XMVectorATanEst(Y * (1/X)) elsewhere.
inline XMVECTOR XM_CALLCONV XMVectorATan2Est
(
    FXMVECTOR Y,
    FXMVECTOR X
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            atan2f(Y.vector4_f32[0], X.vector4_f32[0]),
            atan2f(Y.vector4_f32[1], X.vector4_f32[1]),
            atan2f(Y.vector4_f32[2], X.vector4_f32[2]),
            atan2f(Y.vector4_f32[3], X.vector4_f32[3]),
        } } };
    return Result.v;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_atan2_ps(Y, X);
    return Result;
#else

    // Lane layout: x = pi, y = pi/2, z = pi/4, w = 3*pi/4.
    static const XMVECTORF32 ATan2Constants = { { { XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */ } } };

    const XMVECTOR Zero = XMVectorZero();
    // Starts as the XMVectorTrueInt() pattern and is used as a sentinel value
    // inside the select chain; it is turned into a real per-lane mask after it.
    XMVECTOR ATanResultValid = XMVectorTrueInt();

    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);

    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
    // "Positive" here means the sign bit of X is clear (integer compare of the
    // masked sign bit against zero).
    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);

    // OR the sign bit of Y into each constant so they carry Y's sign.
    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
    Pi = XMVectorOrInt(Pi, YSign);
    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);

    // Special-case selection chain.
    // NOTE(review): the descriptions below assume XMVectorSelect(A, B, Control)
    // yields B where Control is set and A elsewhere - confirm against its definition.
    //   R1: Y == 0 case     -> signed zero when X is positive, signed pi otherwise.
    //   R2: X == 0 case     -> signed pi/2; otherwise the sentinel (no special case so far).
    //   R3: picks R1 where Y == 0, else R2.
    //   R4: both infinite   -> signed pi/4 when X is positive, signed 3*pi/4 otherwise.
    //   R5: Y infinite      -> R4 where X is infinite too, else signed pi/2.
    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
    // Lanes still holding the sentinel had no special case applied; the
    // integer compare turns that into the mask used for the final select.
    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

    // General case: arctangent estimate of Y * (estimated 1/X).
    XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
    XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
    XMVECTOR R0 = XMVectorATanEst(V);

    // Offset added to the estimate: negative zero where X is positive,
    // pi carrying Y's sign otherwise (same select assumption as above).
    R1 = XMVectorSelect(Pi, g_XMNegativeZero, XIsPositive);
    R2 = XMVectorAdd(R0, R1);

    // Keep the special-case value unless the lane was flagged as valid.
    Result = XMVectorSelect(Result, R2, ATanResultValid);

    return Result;

#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorLerp
(
    FXMVECTOR V0,
    FXMVECTOR V1,
    float    t
) noexcept
{
    // Linear interpolation with one scalar factor for all lanes:
    //   V0 + t * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR delta = XMVectorSubtract(V1, V0);
    const XMVECTOR factor = XMVectorReplicate(t);
    return XMVectorMultiplyAdd(delta, factor, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR delta = vsubq_f32(V1, V0);
    return vmlaq_n_f32(V0, delta, t);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR factor = _mm_set_ps1(t);
    const XMVECTOR delta = _mm_sub_ps(V1, V0);
    return XM_FMADD_PS(delta, factor, V0);
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorLerpV
(
    FXMVECTOR V0,
    FXMVECTOR V1,
    FXMVECTOR T
) noexcept
{
    // Linear interpolation with a separate factor per lane:
    //   V0 + T * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR delta = XMVectorSubtract(V1, V0);
    return XMVectorMultiplyAdd(delta, T, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR delta = vsubq_f32(V1, V0);
    return vmlaq_f32(V0, delta, T);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR delta = _mm_sub_ps(V1, V0);
    return XM_FMADD_PS(delta, T, V0);
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorHermite
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    float    t
) noexcept
{
    // Hermite spline evaluated at scalar parameter t:
    //   Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
    //            (t^3 - 2 * t^2 + t) * Tangent0 +
    //            (-2 * t^3 + 3 * t^2) * Position1 +
    //            (t^3 - t^2) * Tangent1

    // The four scalar basis weights are the same for every code path.
    const float tSquared = t * t;
    const float tCubed = t * tSquared;

    const float weightPos0 = 2.0f * tCubed - 3.0f * tSquared + 1.0f;
    const float weightTan0 = tCubed - 2.0f * tSquared + t;
    const float weightPos1 = -2.0f * tCubed + 3.0f * tSquared;
    const float weightTan1 = tCubed - tSquared;

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR splatPos0 = XMVectorReplicate(weightPos0);
    const XMVECTOR splatTan0 = XMVectorReplicate(weightTan0);
    const XMVECTOR splatPos1 = XMVectorReplicate(weightPos1);
    const XMVECTOR splatTan1 = XMVectorReplicate(weightTan1);

    XMVECTOR blended = XMVectorMultiply(splatPos0, Position0);
    blended = XMVectorMultiplyAdd(splatTan0, Tangent0, blended);
    blended = XMVectorMultiplyAdd(splatPos1, Position1, blended);
    blended = XMVectorMultiplyAdd(splatTan1, Tangent1, blended);

    return blended;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR blended = vmulq_n_f32(Position0, weightPos0);
    blended = vmlaq_n_f32(blended, Tangent0, weightTan0);
    blended = vmlaq_n_f32(blended, Position1, weightPos1);
    blended = vmlaq_n_f32(blended, Tangent1, weightTan1);
    return blended;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR splatPos0 = _mm_set_ps1(weightPos0);
    const XMVECTOR splatTan0 = _mm_set_ps1(weightTan0);
    const XMVECTOR splatPos1 = _mm_set_ps1(weightPos1);
    const XMVECTOR splatTan1 = _mm_set_ps1(weightTan1);

    XMVECTOR blended = _mm_mul_ps(splatPos0, Position0);
    blended = XM_FMADD_PS(Tangent0, splatTan0, blended);
    blended = XM_FMADD_PS(Position1, splatPos1, blended);
    blended = XM_FMADD_PS(Tangent1, splatTan1, blended);
    return blended;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorHermiteV
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    HXMVECTOR T
) noexcept
{
    // Hermite spline where each basis weight reads its own lane of T:
    //   Result = (2 * T.x^3 - 3 * T.x^2 + 1) * Position0 +
    //            (T.y^3 - 2 * T.y^2 + T.y) * Tangent0 +
    //            (-2 * T.z^3 + 3 * T.z^2) * Position1 +
    //            (T.w^3 - T.w^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR tSquared = XMVectorMultiply(T, T);
    const XMVECTOR tCubed = XMVectorMultiply(T, tSquared);

    const XMVECTOR weightPos0 = XMVectorReplicate(2.0f * tCubed.vector4_f32[0] - 3.0f * tSquared.vector4_f32[0] + 1.0f);
    const XMVECTOR weightTan0 = XMVectorReplicate(tCubed.vector4_f32[1] - 2.0f * tSquared.vector4_f32[1] + T.vector4_f32[1]);
    const XMVECTOR weightPos1 = XMVectorReplicate(-2.0f * tCubed.vector4_f32[2] + 3.0f * tSquared.vector4_f32[2]);
    const XMVECTOR weightTan1 = XMVectorReplicate(tCubed.vector4_f32[3] - tSquared.vector4_f32[3]);

    XMVECTOR blended = XMVectorMultiply(weightPos0, Position0);
    blended = XMVectorMultiplyAdd(weightTan0, Tangent0, blended);
    blended = XMVectorMultiplyAdd(weightPos1, Position1, blended);
    blended = XMVectorMultiplyAdd(weightTan1, Tangent1, blended);

    return blended;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane multipliers of t^2 and t^3 in the four basis polynomials.
    static const XMVECTORF32 SquareScale = { { { -3.0f, -2.0f, 3.0f, -1.0f } } };
    static const XMVECTORF32 CubeScale = { { { 2.0f, 1.0f, -2.0f, 1.0f } } };

    XMVECTOR tSquared = vmulq_f32(T, T);
    XMVECTOR weights = vmulq_f32(T, tSquared);  // t^3
    // Scale t^2, then accumulate the scaled t^3 on top of it.
    tSquared = vmulq_f32(tSquared, SquareScale);
    weights = vmlaq_f32(tSquared, weights, CubeScale);
    // Only the y polynomial has a linear term: add T with all but y masked off.
    XMVECTOR linearY = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T), g_XMMaskY));
    weights = vaddq_f32(weights, linearY);
    // Only the x polynomial has the constant 1.
    weights = vaddq_f32(weights, g_XMIdentityR0);
    // weights now holds one basis weight per lane; combine the inputs.
    XMVECTOR blended = vmulq_lane_f32(Position0, vget_low_f32(weights), 0);       // x weight
    blended = vmlaq_lane_f32(blended, Tangent0, vget_low_f32(weights), 1);        // y weight
    blended = vmlaq_lane_f32(blended, Position1, vget_high_f32(weights), 0);      // z weight
    blended = vmlaq_lane_f32(blended, Tangent1, vget_high_f32(weights), 1);       // w weight
    return blended;
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-lane multipliers of t^2 and t^3 in the four basis polynomials.
    static const XMVECTORF32 SquareScale = { { { -3.0f, -2.0f, 3.0f, -1.0f } } };
    static const XMVECTORF32 CubeScale = { { { 2.0f, 1.0f, -2.0f, 1.0f } } };

    XMVECTOR tSquared = _mm_mul_ps(T, T);
    XMVECTOR weights = _mm_mul_ps(T, tSquared);  // t^3
    // Scale t^2, then accumulate the scaled t^3 on top of it.
    tSquared = _mm_mul_ps(tSquared, SquareScale);
    weights = XM_FMADD_PS(weights, CubeScale, tSquared);
    // Only the y polynomial has a linear term: add T with all but y masked off.
    XMVECTOR linearY = _mm_and_ps(T, g_XMMaskY);
    weights = _mm_add_ps(weights, linearY);
    // Only the x polynomial has the constant 1.
    weights = _mm_add_ps(weights, g_XMIdentityR0);
    // weights now holds one basis weight per lane; combine the inputs.
    XMVECTOR blended = XM_PERMUTE_PS(weights, _MM_SHUFFLE(0, 0, 0, 0));
    blended = _mm_mul_ps(blended, Position0);
    XMVECTOR lane = XM_PERMUTE_PS(weights, _MM_SHUFFLE(1, 1, 1, 1));
    blended = XM_FMADD_PS(lane, Tangent0, blended);
    lane = XM_PERMUTE_PS(weights, _MM_SHUFFLE(2, 2, 2, 2));
    blended = XM_FMADD_PS(lane, Position1, blended);
    lane = XM_PERMUTE_PS(weights, _MM_SHUFFLE(3, 3, 3, 3));
    blended = XM_FMADD_PS(lane, Tangent1, blended);
    return blended;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorCatmullRom
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    float    t
) noexcept
{
    // Catmull-Rom spline evaluated at scalar parameter t:
    //   Result = ((-t^3 + 2 * t^2 - t) * Position0 +
    //             (3 * t^3 - 5 * t^2 + 2) * Position1 +
    //             (-3 * t^3 + 4 * t^2 + t) * Position2 +
    //             (t^3 - t^2) * Position3) * 0.5

    // The four scalar weights (with the 0.5 folded in) are shared by every code path.
    const float tSquared = t * t;
    const float tCubed = t * tSquared;

    const float weight0 = (-tCubed + 2.0f * tSquared - t) * 0.5f;
    const float weight1 = (3.0f * tCubed - 5.0f * tSquared + 2.0f) * 0.5f;
    const float weight2 = (-3.0f * tCubed + 4.0f * tSquared + t) * 0.5f;
    const float weight3 = (tCubed - tSquared) * 0.5f;

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR splat0 = XMVectorReplicate(weight0);
    const XMVECTOR splat1 = XMVectorReplicate(weight1);
    const XMVECTOR splat2 = XMVectorReplicate(weight2);
    const XMVECTOR splat3 = XMVectorReplicate(weight3);

    // Accumulate the four weighted control points in order.
    XMVECTOR blended = XMVectorMultiply(splat0, Position0);
    blended = XMVectorMultiplyAdd(splat1, Position1, blended);
    blended = XMVectorMultiplyAdd(splat2, Position2, blended);
    blended = XMVectorMultiplyAdd(splat3, Position3, blended);

    return blended;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Two independent partial sums (0+1 and 2+3), added at the end.
    XMVECTOR firstPair = vmulq_n_f32(Position1, weight1);
    firstPair = vmlaq_n_f32(firstPair, Position0, weight0);
    XMVECTOR secondPair = vmulq_n_f32(Position3, weight3);
    secondPair = vmlaq_n_f32(secondPair, Position2, weight2);
    return vaddq_f32(firstPair, secondPair);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR splat0 = _mm_set_ps1(weight0);
    const XMVECTOR splat1 = _mm_set_ps1(weight1);
    const XMVECTOR splat2 = _mm_set_ps1(weight2);
    const XMVECTOR splat3 = _mm_set_ps1(weight3);

    // Two independent partial sums (0+1 and 2+3), added at the end.
    XMVECTOR firstPair = _mm_mul_ps(Position1, splat1);
    firstPair = XM_FMADD_PS(Position0, splat0, firstPair);
    XMVECTOR secondPair = _mm_mul_ps(Position3, splat3);
    secondPair = XM_FMADD_PS(Position2, splat2, secondPair);
    return _mm_add_ps(firstPair, secondPair);
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorCatmullRomV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    HXMVECTOR T
) noexcept
{
    // Catmull-Rom spline where each lane is interpolated with its own
    // parameter taken from the matching lane of T:
    //   0.5 * ((-t^3 + 2t^2 - t) * P0 + (3t^3 - 5t^2 + 2) * P1 +
    //          (-3t^3 + 4t^2 + t) * P2 + (t^3 - t^2) * P3)
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 blended;
    for (uint32_t lane = 0; lane < 4; ++lane)
    {
        const float s = T.vector4_f32[lane];
        blended.f[lane] =
            0.5f * ((-s * s * s + 2 * s * s - s) * Position0.vector4_f32[lane]
            + (3 * s * s * s - 5 * s * s + 2) * Position1.vector4_f32[lane]
            + (-3 * s * s * s + 4 * s * s + s) * Position2.vector4_f32[lane]
            + (s * s * s - s * s) * Position3.vector4_f32[lane]);
    }
    return blended.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Two = { { { 2.0f, 2.0f, 2.0f, 2.0f } } };
    static const XMVECTORF32 Three = { { { 3.0f, 3.0f, 3.0f, 3.0f } } };
    static const XMVECTORF32 Four = { { { 4.0f, 4.0f, 4.0f, 4.0f } } };
    static const XMVECTORF32 Five = { { { 5.0f, 5.0f, 5.0f, 5.0f } } };
    // Powers of T used by every term
    XMVECTOR tSquared = vmulq_f32(T, T);
    XMVECTOR tCubed = vmulq_f32(T, tSquared);
    // Position0 term: (2T^2 - T - T^3) * Position0
    XMVECTOR blended = vaddq_f32(tSquared, tSquared);
    blended = vsubq_f32(blended, T);
    blended = vsubq_f32(blended, tCubed);
    blended = vmulq_f32(blended, Position0);
    // Position1 term: (3T^3 - 5T^2 + 2) * Position1
    XMVECTOR weight = vmulq_f32(tCubed, Three);
    weight = vmlsq_f32(weight, tSquared, Five);
    weight = vaddq_f32(weight, Two);
    blended = vmlaq_f32(blended, weight, Position1);
    // Position2 term: (4T^2 - 3T^3 + T) * Position2
    weight = vmulq_f32(tSquared, Four);
    weight = vmlsq_f32(weight, tCubed, Three);
    weight = vaddq_f32(weight, T);
    blended = vmlaq_f32(blended, weight, Position2);
    // Position3 term: (T^3 - T^2) * Position3
    tCubed = vsubq_f32(tCubed, tSquared);
    blended = vmlaq_f32(blended, tCubed, Position3);
    // Final scale by 0.5f
    blended = vmulq_f32(blended, g_XMOneHalf);
    return blended;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Two = { { { 2.0f, 2.0f, 2.0f, 2.0f } } };
    static const XMVECTORF32 Three = { { { 3.0f, 3.0f, 3.0f, 3.0f } } };
    static const XMVECTORF32 Four = { { { 4.0f, 4.0f, 4.0f, 4.0f } } };
    static const XMVECTORF32 Five = { { { 5.0f, 5.0f, 5.0f, 5.0f } } };
    // Powers of T used by every term
    XMVECTOR tSquared = _mm_mul_ps(T, T);
    XMVECTOR tCubed = _mm_mul_ps(T, tSquared);
    // Position0 term: (2T^2 - T - T^3) * Position0
    XMVECTOR blended = _mm_add_ps(tSquared, tSquared);
    blended = _mm_sub_ps(blended, T);
    blended = _mm_sub_ps(blended, tCubed);
    blended = _mm_mul_ps(blended, Position0);
    // Position1 term: (3T^3 - 5T^2 + 2) * Position1
    XMVECTOR weight = _mm_mul_ps(tCubed, Three);
    weight = XM_FNMADD_PS(tSquared, Five, weight);
    weight = _mm_add_ps(weight, Two);
    blended = XM_FMADD_PS(weight, Position1, blended);
    // Position2 term: (4T^2 - 3T^3 + T) * Position2
    weight = _mm_mul_ps(tSquared, Four);
    weight = XM_FNMADD_PS(tCubed, Three, weight);
    weight = _mm_add_ps(weight, T);
    blended = XM_FMADD_PS(weight, Position2, blended);
    // Position3 term: (T^3 - T^2) * Position3
    tCubed = _mm_sub_ps(tCubed, tSquared);
    blended = XM_FMADD_PS(tCubed, Position3, blended);
    // Final scale by 0.5f
    blended = _mm_mul_ps(blended, g_XMOneHalf);
    return blended;
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorBaryCentric
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    float    f,
    float    g
) noexcept
{
    // Barycentric combination with scalar weights f and g:
    //   Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR edge1 = XMVectorSubtract(Position1, Position0);
    const XMVECTOR edge2 = XMVectorSubtract(Position2, Position0);

    const XMVECTOR weightF = XMVectorReplicate(f);
    const XMVECTOR weightG = XMVectorReplicate(g);

    // Add the f-weighted edge first, then the g-weighted one.
    const XMVECTOR partial = XMVectorMultiplyAdd(edge1, weightF, Position0);
    return XMVectorMultiplyAdd(edge2, weightG, partial);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR edge1 = vsubq_f32(Position1, Position0);
    const XMVECTOR edge2 = vsubq_f32(Position2, Position0);
    const XMVECTOR partial = vmlaq_n_f32(Position0, edge1, f);
    return vmlaq_n_f32(partial, edge2, g);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR edge1 = _mm_sub_ps(Position1, Position0);
    const XMVECTOR edge2 = _mm_sub_ps(Position2, Position0);
    const XMVECTOR weightF = _mm_set_ps1(f);
    const XMVECTOR weightG = _mm_set_ps1(g);
    const XMVECTOR partial = XM_FMADD_PS(edge1, weightF, Position0);
    return XM_FMADD_PS(edge2, weightG, partial);
#endif
}

//------------------------------------------------------------------------------

inline XMVECTOR XM_CALLCONV XMVectorBaryCentricV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR F,
    HXMVECTOR G
) noexcept
{
    // Barycentric combination with per-lane weights F and G:
    //   Result = Position0 + F * (Position1 - Position0) + G * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR edge1 = XMVectorSubtract(Position1, Position0);
    const XMVECTOR edge2 = XMVectorSubtract(Position2, Position0);

    // Add the F-weighted edge first, then the G-weighted one.
    const XMVECTOR partial = XMVectorMultiplyAdd(edge1, F, Position0);
    return XMVectorMultiplyAdd(edge2, G, partial);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR edge1 = vsubq_f32(Position1, Position0);
    const XMVECTOR edge2 = vsubq_f32(Position2, Position0);
    const XMVECTOR partial = vmlaq_f32(Position0, edge1, F);
    return vmlaq_f32(partial, edge2, G);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR edge1 = _mm_sub_ps(Position1, Position0);
    const XMVECTOR edge2 = _mm_sub_ps(Position2, Position0);
    const XMVECTOR partial = XM_FMADD_PS(edge1, F, Position0);
    return XM_FMADD_PS(edge2, G, partial);
#endif
}

/****************************************************************************
 *
 * 2D Vector
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Comparison operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------

inline bool XM_CALLCONV XMVector2Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    // True when the x and y components of V1 and V2 compare equal as floats;
    // z and w are ignored.
#if defined(_XM_NO_INTRINSICS_)
    return (V1.vector4_f32[0] == V2.vector4_f32[0])
        && (V1.vector4_f32[1] == V2.vector4_f32[1]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Compare the low two lanes and view both 32-bit masks as one 64-bit value.
    const uint32x2_t lanesEqual = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
    return vget_lane_u64(vreinterpret_u64_u32(lanesEqual), 0) == 0xFFFFFFFFFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // Bits 0 and 1 of the move mask correspond to x and y.
    const int laneBits = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2));
    return (laneBits & 3) == 3;
#endif
}


//------------------------------------------------------------------------------

// Floating-point equality test on x and y that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - both components compare equal
//   XM_CRMASK_CR6FALSE - neither component compares equal
//   0                  - exactly one component compares equal
inline uint32_t XM_CALLCONV XMVector2EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6TRUE;
    }
    if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
    // Both lane masks packed into one 64-bit value
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    if (uLanes == 0xFFFFFFFFFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (uLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Keep only the x and y bits of the move mask
    const int iLanes = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2)) & 3;
    switch (iLanes)
    {
    case 3:
        return XM_CRMASK_CR6TRUE;
    case 0:
        return XM_CRMASK_CR6FALSE;
    default:
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Integer (bit-pattern) equality test on the x and y components.
// Returns true only when both 32-bit values match; z and w are ignored.
inline bool XM_CALLCONV XMVector2EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bSameX = (V1.vector4_u32[0] == V2.vector4_u32[0]);
    const bool bSameY = (V1.vector4_u32[1] == V2.vector4_u32[1]);
    return (bSameX && bSameY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2)));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare, then read the lane sign bits through the float move mask
    const __m128i vMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    const int iLanes = _mm_movemask_ps(_mm_castsi128_ps(vMask)) & 3;
    return (iLanes == 3);
#endif
}

//------------------------------------------------------------------------------

// Integer (bit-pattern) equality test on x and y that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - both components match
//   XM_CRMASK_CR6FALSE - neither component matches
//   0                  - exactly one component matches
inline uint32_t XM_CALLCONV XMVector2EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const bool bSameX = (V1.vector4_u32[0] == V2.vector4_u32[0]);
    const bool bSameY = (V1.vector4_u32[1] == V2.vector4_u32[1]);
    if (bSameX && bSameY)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (!bSameX && !bSameY)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2)));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    if (uLanes == 0xFFFFFFFFFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (uLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i vMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Only the x and y bits of the move mask matter
    const int iLanes = _mm_movemask_ps(_mm_castsi128_ps(vMask)) & 3;
    switch (iLanes)
    {
    case 3:
        return XM_CRMASK_CR6TRUE;
    case 0:
        return XM_CRMASK_CR6FALSE;
    default:
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Tolerance comparison on x and y: returns true when |V1 - V2| <= Epsilon holds
// for both components, each tested against the matching component of Epsilon.
inline bool XM_CALLCONV XMVector2NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float fGapX = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
    const float fGapY = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
    const bool bNearX = (fGapX <= Epsilon.vector4_f32[0]);
    const bool bNearY = (fGapY <= Epsilon.vector4_f32[1]);
    return (bNearX && bNearY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vGap = vsub_f32(vget_low_f32(V1), vget_low_f32(V2));
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
    // Absolute-compare form
    const uint32x2_t vMask = vacle_f32(vGap, vget_low_u32(Epsilon));
#else
    // Explicit absolute value followed by a plain less-or-equal compare
    const uint32x2_t vMask = vcle_f32(vabs_f32(vGap), vget_low_f32(Epsilon));
#endif
    return (vget_lane_u64(vreinterpret_u64_u32(vMask), 0) == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vGap = _mm_sub_ps(V1, V2);
    // |gap| is formed as max(0 - gap, gap)
    const XMVECTOR vNegGap = _mm_sub_ps(_mm_setzero_ps(), vGap);
    const XMVECTOR vAbsGap = _mm_max_ps(vNegGap, vGap);
    const XMVECTOR vWithin = _mm_cmple_ps(vAbsGap, Epsilon);
    // Only the x and y mask bits are examined
    return ((_mm_movemask_ps(vWithin) & 3) == 0x3);
#endif
}

//------------------------------------------------------------------------------

// Floating-point inequality test on x and y.
// Returns true when at least one of the two components differs; z and w are ignored.
inline bool XM_CALLCONV XMVector2NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bDiffX = (V1.vector4_f32[0] != V2.vector4_f32[0]);
    const bool bDiffY = (V1.vector4_f32[1] != V2.vector4_f32[1]);
    return (bDiffX || bDiffY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Any lane that failed the equality compare leaves the packed mask short of all-ones
    const uint32x2_t vMask = vceq_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes != 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Equality mask restricted to x and y; anything other than both bits set means "not equal"
    const int iLanes = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2)) & 3;
    return (iLanes != 3);
#endif
}

//------------------------------------------------------------------------------

// Integer (bit-pattern) inequality test on x and y.
// Returns true when at least one of the two 32-bit values differs; z and w are ignored.
inline bool XM_CALLCONV XMVector2NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bDiffX = (V1.vector4_u32[0] != V2.vector4_u32[0]);
    const bool bDiffY = (V1.vector4_u32[1] != V2.vector4_u32[1]);
    return (bDiffX || bDiffY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vceq_u32(vget_low_u32(vreinterpretq_u32_f32(V1)), vget_low_u32(vreinterpretq_u32_f32(V2)));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes != 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i vMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Equality bits for x and y; anything short of both set means a mismatch
    const int iLanes = _mm_movemask_ps(_mm_castsi128_ps(vMask)) & 3;
    return (iLanes != 3);
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is strictly greater than V2 in both the x and the y component.
// z and w are ignored.
inline bool XM_CALLCONV XMVector2Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bGreaterX = (V1.vector4_f32[0] > V2.vector4_f32[0]);
    const bool bGreaterY = (V1.vector4_f32[1] > V2.vector4_f32[1]);
    return (bGreaterX && bGreaterY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Restrict the move mask to the x and y bits
    const int iLanes = _mm_movemask_ps(_mm_cmpgt_ps(V1, V2)) & 3;
    return (iLanes == 3);
#endif
}

//------------------------------------------------------------------------------

// Strict greater-than test on x and y that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - V1 > V2 in both components
//   XM_CRMASK_CR6FALSE - V1 > V2 in neither component
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector2GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] > V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6TRUE;
    }
    // The "all false" case is tested with an explicit <= on each component
    if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] <= V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vcgt_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    if (uLanes == 0xFFFFFFFFFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (uLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // x and y bits of the greater-than mask
    const int iLanes = _mm_movemask_ps(_mm_cmpgt_ps(V1, V2)) & 3;
    switch (iLanes)
    {
    case 3:
        return XM_CRMASK_CR6TRUE;
    case 0:
        return XM_CRMASK_CR6FALSE;
    default:
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 >= V2 holds for both the x and the y component.
// z and w are ignored.
inline bool XM_CALLCONV XMVector2GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bGreaterEqX = (V1.vector4_f32[0] >= V2.vector4_f32[0]);
    const bool bGreaterEqY = (V1.vector4_f32[1] >= V2.vector4_f32[1]);
    return (bGreaterEqX && bGreaterEqY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vcge_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Restrict the move mask to the x and y bits
    const int iLanes = _mm_movemask_ps(_mm_cmpge_ps(V1, V2)) & 3;
    return (iLanes == 3);
#endif
}

//------------------------------------------------------------------------------

// Greater-or-equal test on x and y that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - V1 >= V2 in both components
//   XM_CRMASK_CR6FALSE - V1 >= V2 in neither component
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector2GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6TRUE;
    }
    // The "all false" case is tested with an explicit < on each component
    if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]))
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vcge_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    if (uLanes == 0xFFFFFFFFFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (uLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // x and y bits of the greater-or-equal mask
    const int iLanes = _mm_movemask_ps(_mm_cmpge_ps(V1, V2)) & 3;
    switch (iLanes)
    {
    case 3:
        return XM_CRMASK_CR6TRUE;
    case 0:
        return XM_CRMASK_CR6FALSE;
    default:
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is strictly less than V2 in both the x and the y component.
// z and w are ignored.
inline bool XM_CALLCONV XMVector2Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bLessX = (V1.vector4_f32[0] < V2.vector4_f32[0]);
    const bool bLessY = (V1.vector4_f32[1] < V2.vector4_f32[1]);
    return (bLessX && bLessY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vclt_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Restrict the move mask to the x and y bits
    const int iLanes = _mm_movemask_ps(_mm_cmplt_ps(V1, V2)) & 3;
    return (iLanes == 3);
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 <= V2 holds for both the x and the y component.
// z and w are ignored.
inline bool XM_CALLCONV XMVector2LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bLessEqX = (V1.vector4_f32[0] <= V2.vector4_f32[0]);
    const bool bLessEqY = (V1.vector4_f32[1] <= V2.vector4_f32[1]);
    return (bLessEqX && bLessEqY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x2_t vMask = vcle_f32(vget_low_f32(V1), vget_low_f32(V2));
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vMask), 0);
    return (uLanes == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Restrict the move mask to the x and y bits
    const int iLanes = _mm_movemask_ps(_mm_cmple_ps(V1, V2)) & 3;
    return (iLanes == 3);
#endif
}

//------------------------------------------------------------------------------

// Returns true when both x and y of V lie inside the symmetric range
// [-Bounds, +Bounds], tested per component with inclusive limits.
inline bool XM_CALLCONV XMVector2InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bInsideX = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]);
    const bool bInsideY = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]);
    return (bInsideX && bInsideY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vValue = vget_low_f32(V);
    const float32x2_t vLimit = vget_low_f32(Bounds);
    // Upper limit: V <= Bounds
    const uint32x2_t vBelowMax = vcle_f32(vValue, vLimit);
    // Lower limit: -Bounds <= V
    const uint32x2_t vAboveMin = vcle_f32(vneg_f32(vLimit), vValue);
    // A lane passes only when both limits hold
    const uint32x2_t vInside = vand_u32(vBelowMax, vAboveMin);
    return (vget_lane_u64(vreinterpret_u64_u32(vInside), 0) == 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Upper limit: V <= Bounds
    const XMVECTOR vBelowMax = _mm_cmple_ps(V, Bounds);
    // Lower limit: (Bounds * -1) <= V
    const XMVECTOR vNegLimit = _mm_mul_ps(Bounds, g_XMNegativeOne);
    const XMVECTOR vAboveMin = _mm_cmple_ps(vNegLimit, V);
    // A lane passes only when both limits hold
    const XMVECTOR vInside = _mm_and_ps(vBelowMax, vAboveMin);
    // Examine x and y only
    return ((_mm_movemask_ps(vInside) & 0x3) == 0x3);
#endif
}

//------------------------------------------------------------------------------

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Returns true when the x or the y component of V is a NaN; z and w are ignored.
// NOTE(review): the surrounding float_control pragmas presumably keep MSVC from
// optimizing away the self-comparison used below - confirm before removing them.
inline bool XM_CALLCONV XMVector2IsNaN(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const bool bNaNX = XMISNAN(V.vector4_f32[0]);
    const bool bNaNY = XMISNAN(V.vector4_f32[1]);
    return (bNaNX || bNaNY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // A NaN never compares equal to itself, so its lane in the mask is zero
    const uint32x2_t vSelfEqual = vceq_f32(vLow, vLow);
    const uint64_t uLanes = vget_lane_u64(vreinterpret_u64_u32(vSelfEqual), 0);
    // Anything short of all-ones means x or y failed the self-compare
    return (uLanes != 0xFFFFFFFFFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // A NaN is always "not equal" to itself, which sets its lane in the mask
    const XMVECTOR vSelfDiffers = _mm_cmpneq_ps(V, V);
    // Non-zero x/y bits indicate a NaN in one of those components
    const int iLanes = _mm_movemask_ps(vSelfDiffers) & 3;
    return (iLanes != 0);
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Returns true when the x or the y component of V is +infinity or -infinity.
// z and w are ignored.
inline bool XM_CALLCONV XMVector2IsInfinite(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const bool bInfX = XMISINF(V.vector4_f32[0]);
    const bool bInfY = XMISINF(V.vector4_f32[1]);
    return (bInfX || bInfY);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Strip the sign bit so that -infinity matches +infinity
    const uint32x2_t vMagnitude = vand_u32(vget_low_u32(vreinterpretq_u32_f32(V)), vget_low_u32(g_XMAbsMask));
    // Lane mask of components equal to infinity
    const uint32x2_t vIsInf = vceq_f32(vreinterpret_f32_u32(vMagnitude), vget_low_f32(g_XMInfinity));
    // Any set bit means x or y matched
    return vget_lane_u64(vreinterpret_u64_u32(vIsInf), 0) != 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Strip the sign bit so that -infinity matches +infinity
    const __m128 vMagnitude = _mm_and_ps(V, g_XMAbsMask);
    // Lane mask of components equal to infinity
    const __m128 vIsInf = _mm_cmpeq_ps(vMagnitude, g_XMInfinity);
    // If x or y is infinity, its bit in the move mask is set
    const int iLanes = _mm_movemask_ps(vIsInf) & 3;
    return (iLanes != 0);
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// 2D dot product: V1.x * V2.x + V1.y * V2.y.
// The scalar result is replicated into every component of the returned vector.
inline XMVECTOR XM_CALLCONV XMVector2Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float fDot = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
    XMVECTORF32 Result;
    Result.f[3] = fDot;
    Result.f[2] = fDot;
    Result.f[1] = fDot;
    Result.f[0] = fDot;
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Multiply the low halves, then a pairwise add leaves x+y in both lanes
    const float32x2_t vProduct = vmul_f32(vget_low_f32(V1), vget_low_f32(V2));
    const float32x2_t vSum = vpadd_f32(vProduct, vProduct);
    return vcombine_f32(vSum, vSum);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: multiply x and y only, broadcast the sum to all four lanes
    return _mm_dp_ps(V1, V2, 0x3f);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vProduct = _mm_mul_ps(V1, V2);
    // Horizontal add puts x+y into lanes 0 and 2
    const XMVECTOR vPairSums = _mm_hadd_ps(vProduct, vProduct);
    // Duplicating the even lanes spreads x+y across the whole vector
    return _mm_moveldup_ps(vPairSums);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vProduct = _mm_mul_ps(V1, V2);
    // Broadcast the y product so it can be added onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vProduct, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vProduct, vSplatY);
    // Broadcast x+y to every lane
    return XM_PERMUTE_PS(vSum, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// 2D cross product: V1.x * V2.y - V1.y * V2.x.
// The scalar result is replicated into every component of the returned vector.
inline XMVECTOR XM_CALLCONV XMVector2Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
    XMVECTORF32 vResult;
    vResult.f[3] = fCross;
    vResult.f[2] = fCross;
    vResult.f[1] = fCross;
    vResult.f[0] = fCross;
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Negate = { { { 1.f, -1.f, 0, 0 } } };

    // [ V1.x * V2.y, V1.y * V2.x ]
    const float32x2_t vProducts = vmul_f32(vget_low_f32(V1), vrev64_f32(vget_low_f32(V2)));
    // Flip the sign of the second product so the pairwise add performs the subtraction
    const float32x2_t vSigned = vmul_f32(vProducts, vget_low_f32(Negate));
    const float32x2_t vCross = vpadd_f32(vSigned, vSigned);
    return vcombine_f32(vCross, vCross);
#elif defined(_XM_SSE_INTRINSICS_)
    // V2 with x and y exchanged
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 1, 0, 1));
    // Lane 0 = V1.x * V2.y, lane 1 = V1.y * V2.x
    const XMVECTOR vProducts = _mm_mul_ps(vSwapped, V1);
    // Broadcast lane 1 and subtract it from lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vProducts, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vCross = _mm_sub_ss(vProducts, vSplatY);
    // Broadcast the cross product to every lane
    return XM_PERMUTE_PS(vCross, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// Squared 2D length of V, obtained as the 2D dot product of V with itself.
inline XMVECTOR XM_CALLCONV XMVector2LengthSq(FXMVECTOR V) noexcept
{
    const XMVECTOR vLengthSq = XMVector2Dot(V, V);
    return vLengthSq;
}

//------------------------------------------------------------------------------

// Estimate of 1 / length of the 2D vector (x, y), built from reciprocal-square-root
// estimate operations on the squared length. The value is replicated across the result.
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrtEst(XMVector2LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // x*x + y*y in both lanes
    const float32x2_t vSquares = vmul_f32(vLow, vLow);
    const float32x2_t vLengthSq = vpadd_f32(vSquares, vSquares);
    // Hardware reciprocal square root estimate
    const float32x2_t vRcpLength = vrsqrte_f32(vLengthSq);
    return vcombine_f32(vRcpLength, vRcpLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: dot of x and y only, broadcast to all lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    return _mm_rsqrt_ps(vLengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Lane 0 holds x*x + y*y after the horizontal add
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vRcpLength = _mm_rsqrt_ss(vPairSums);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Add the y square onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vSquares, vSplatY);
    const XMVECTOR vRcpLength = _mm_rsqrt_ss(vSum);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// 1 / length of the 2D vector (x, y), replicated across the result.
// The SSE paths use a square root followed by a divide; the NEON path refines
// a reciprocal-square-root estimate with two vrsqrts steps.
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrt(XMVector2LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // x*x + y*y in both lanes
    const float32x2_t vSquares = vmul_f32(vLow, vLow);
    const float32x2_t vLengthSq = vpadd_f32(vSquares, vSquares);
    // Initial estimate, then two refinement rounds
    const float32x2_t vEst0 = vrsqrte_f32(vLengthSq);
    const float32x2_t vProd0 = vmul_f32(vLengthSq, vEst0);
    const float32x2_t vStep0 = vrsqrts_f32(vProd0, vEst0);
    const float32x2_t vEst1 = vmul_f32(vEst0, vStep0);
    const float32x2_t vProd1 = vmul_f32(vLengthSq, vEst1);
    const float32x2_t vStep1 = vrsqrts_f32(vProd1, vEst1);
    const float32x2_t vRcpLength = vmul_f32(vEst1, vStep1);
    return vcombine_f32(vRcpLength, vRcpLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: dot of x and y only, broadcast to all lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    const XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    return _mm_div_ps(g_XMOne, vLength);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Lane 0 holds x*x + y*y after the horizontal add
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLength = _mm_sqrt_ss(vPairSums);
    const XMVECTOR vRcpLength = _mm_div_ss(g_XMOne, vLength);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Add the y square onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vSquares, vSplatY);
    const XMVECTOR vLength = _mm_sqrt_ss(vSum);
    const XMVECTOR vRcpLength = _mm_div_ss(g_XMOne, vLength);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// Estimated length of the 2D vector (x, y), replicated across the result.
// The NEON path multiplies the squared length by a reciprocal-square-root estimate
// and forces a zero result when the squared length is zero; the SSE paths use sqrt.
inline XMVECTOR XM_CALLCONV XMVector2LengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrtEst(XMVector2LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // x*x + y*y in both lanes
    const float32x2_t vSquares = vmul_f32(vLow, vLow);
    const float32x2_t vLengthSq = vpadd_f32(vSquares, vSquares);
    const float32x2_t vZero = vdup_n_f32(0);
    const uint32x2_t vIsZero = vceq_f32(vLengthSq, vZero);
    // lengthSq * rsqrt(lengthSq) approximates the length
    const float32x2_t vRcpEst = vrsqrte_f32(vLengthSq);
    const float32x2_t vLength = vmul_f32(vLengthSq, vRcpEst);
    // Substitute zero where the squared length compared equal to zero
    const float32x2_t vGuarded = vbsl_f32(vIsZero, vZero, vLength);
    return vcombine_f32(vGuarded, vGuarded);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: dot of x and y only, broadcast to all lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    return _mm_sqrt_ps(vLengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Lane 0 holds x*x + y*y after the horizontal add
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLength = _mm_sqrt_ss(vPairSums);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vLength, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Add the y square onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vSquares, vSplatY);
    const XMVECTOR vLength = _mm_sqrt_ss(vSum);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vLength, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// Length of the 2D vector (x, y), replicated across the result.
// The NEON path refines a reciprocal-square-root estimate with two vrsqrts steps,
// multiplies it by the squared length, and forces zero when the squared length is zero.
inline XMVECTOR XM_CALLCONV XMVector2Length(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrt(XMVector2LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // x*x + y*y in both lanes
    const float32x2_t vSquares = vmul_f32(vLow, vLow);
    const float32x2_t vLengthSq = vpadd_f32(vSquares, vSquares);
    const float32x2_t vZero = vdup_n_f32(0);
    const uint32x2_t vIsZero = vceq_f32(vLengthSq, vZero);
    // Reciprocal square root: initial estimate plus two refinement rounds
    const float32x2_t vEst0 = vrsqrte_f32(vLengthSq);
    const float32x2_t vProd0 = vmul_f32(vLengthSq, vEst0);
    const float32x2_t vStep0 = vrsqrts_f32(vProd0, vEst0);
    const float32x2_t vEst1 = vmul_f32(vEst0, vStep0);
    const float32x2_t vProd1 = vmul_f32(vLengthSq, vEst1);
    const float32x2_t vStep1 = vrsqrts_f32(vProd1, vEst1);
    const float32x2_t vRcpLength = vmul_f32(vEst1, vStep1);
    // lengthSq * (1 / length) gives the length
    const float32x2_t vLength = vmul_f32(vLengthSq, vRcpLength);
    // Substitute zero where the squared length compared equal to zero
    const float32x2_t vGuarded = vbsl_f32(vIsZero, vZero, vLength);
    return vcombine_f32(vGuarded, vGuarded);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: dot of x and y only, broadcast to all lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    return _mm_sqrt_ps(vLengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Lane 0 holds x*x + y*y after the horizontal add
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLength = _mm_sqrt_ss(vPairSums);
    // Broadcast lane 0
    return XM_PERMUTE_PS(vLength, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Add the y square onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vSquares, vSplatY);
    // Broadcast first, then take the square root of all four lanes
    const XMVECTOR vLengthSq = XM_PERMUTE_PS(vSum, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm_sqrt_ps(vLengthSq);
#endif
}

//------------------------------------------------------------------------------
// XMVector2NormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.
// The input is scaled by an approximate 1 / length of its (x, y) part. Note that
// the no-intrinsics path calls XMVector2ReciprocalLength rather than an estimate.

inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR RcpLength = XMVector2ReciprocalLength(V);
    return XMVectorMultiply(V, RcpLength);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t vLow = vget_low_f32(V);
    // x*x + y*y in both lanes
    const float32x2_t vSquares = vmul_f32(vLow, vLow);
    const float32x2_t vLengthSq = vpadd_f32(vSquares, vSquares);
    // Hardware reciprocal square root estimate
    const float32x2_t vRcpLength = vrsqrte_f32(vLengthSq);
    // Scale x and y; the scaled pair fills both halves of the result
    const float32x2_t vScaled = vmul_f32(vLow, vRcpLength);
    return vcombine_f32(vScaled, vScaled);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x3f: dot of x and y only, broadcast to all lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    const XMVECTOR vRcpLength = _mm_rsqrt_ps(vLengthSq);
    return _mm_mul_ps(vRcpLength, V);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Lane 0 holds x*x + y*y after the horizontal add
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vRcpLength = _mm_rsqrt_ss(vPairSums);
    // Broadcast the estimate, then scale every lane of V
    const XMVECTOR vScale = XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm_mul_ps(vScale, V);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // Add the y square onto lane 0
    const XMVECTOR vSplatY = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR vSum = _mm_add_ss(vSquares, vSplatY);
    const XMVECTOR vRcpLength = _mm_rsqrt_ss(vSum);
    // Broadcast the estimate, then scale every lane of V
    const XMVECTOR vScale = XM_PERMUTE_PS(vRcpLength, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm_mul_ps(vScale, V);
#endif
}

//------------------------------------------------------------------------------

// Scales V by the reciprocal of the length of its (x, y) part.
//
// Special cases visible in the code below:
//  - SIMD paths: lanes are forced to zero when the length compares equal to zero,
//    and replaced by g_XMQNaN when the squared length equals g_XMInfinity.
//  - No-intrinsics path: the reciprocal is only taken when the length is > 0;
//    otherwise the components are multiplied by the length value itself, and
//    there is no QNaN substitution.
//
// The SSE and no-intrinsics paths scale all four components of V; the NEON path
// returns the scaled (x, y) pair duplicated into both halves of the result.
inline XMVECTOR XM_CALLCONV XMVector2Normalize(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = XMVector2Length(V);
    float fLength = vResult.vector4_f32[0];

    // Prevent divide by zero
    if (fLength > 0)
    {
        fLength = 1.0f / fLength;
    }

    // z and w are scaled by the same factor as x and y
    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t VL = vget_low_f32(V);
    // Dot2
    float32x2_t vTemp = vmul_f32(VL, VL);
    vTemp = vpadd_f32(vTemp, vTemp);
    // Record the special cases before vTemp is overwritten below
    uint32x2_t VEqualsZero = vceq_f32(vTemp, vdup_n_f32(0));
    uint32x2_t VEqualsInf = vceq_f32(vTemp, vget_low_f32(g_XMInfinity));
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    float32x2_t S0 = vrsqrte_f32(vTemp);
    float32x2_t P0 = vmul_f32(vTemp, S0);
    float32x2_t R0 = vrsqrts_f32(P0, S0);
    float32x2_t S1 = vmul_f32(S0, R0);
    float32x2_t P1 = vmul_f32(vTemp, S1);
    float32x2_t R1 = vrsqrts_f32(P1, S1);
    vTemp = vmul_f32(S1, R1);
    // Normalize
    float32x2_t Result = vmul_f32(VL, vTemp);
    // Zero squared length -> zero result
    Result = vbsl_f32(VEqualsZero, vdup_n_f32(0), Result);
    // Infinite squared length -> QNaN result
    Result = vbsl_f32(VEqualsInf, vget_low_f32(g_XMQNaN), Result);
    return vcombine_f32(Result, Result);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Dot product of x and y only (mask 0x3f), broadcast to all lanes
    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x3f);
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // vLengthSq now becomes a mask: set where the squared length is NOT infinity
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide by the length to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Force lanes to zero where the length compared equal to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(_XM_SSE3_INTRINSICS_)
    // Perform the dot product on x and y only
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    // Duplicate the even lanes so x*x + y*y fills the whole vector
    vLengthSq = _mm_moveldup_ps(vLengthSq);
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // vLengthSq now becomes a mask: set where the squared length is NOT infinity
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide by the length to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Force lanes to zero where the length compared equal to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y only
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 1, 1, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vTemp);
    // Broadcast x*x + y*y to every lane
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // vLengthSq now becomes a mask: set where the squared length is NOT infinity
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide by the length to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Force lanes to zero where the length compared equal to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Scalar-limit convenience wrapper: replicates LengthMin and LengthMax into
// vectors and forwards to XMVector2ClampLengthV.
inline XMVECTOR XM_CALLCONV XMVector2ClampLength
(
    FXMVECTOR V,
    float    LengthMin,
    float    LengthMax
) noexcept
{
    const XMVECTOR vUpperLimit = XMVectorReplicate(LengthMax);
    const XMVECTOR vLowerLimit = XMVectorReplicate(LengthMin);
    const XMVECTOR vClamped = XMVector2ClampLengthV(V, vLowerLimit, vUpperLimit);
    return vClamped;
}

//------------------------------------------------------------------------------

// Clamps the 2D length of V to the range [LengthMin, LengthMax], with the limits
// supplied as vectors. When the length already lies within the range, V itself is
// returned so that no precision is lost.
//
// Preconditions (checked by the asserts below, which are active only when assert is enabled):
//  - x and y of LengthMin are equal, and likewise for LengthMax
//  - both limits are >= 0 in x and y
//  - LengthMax >= LengthMin in x and y
inline XMVECTOR XM_CALLCONV XMVector2ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
) noexcept
{
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));

    XMVECTOR LengthSq = XMVector2LengthSq(V);

    const XMVECTOR Zero = XMVectorZero();

    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);

    // Masks flagging the two degenerate cases: the infinity test is a bitwise
    // (integer) compare, the zero test a floating-point compare
    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);

    // Length = LengthSq / sqrt(LengthSq); Normal = V / sqrt(LengthSq)
    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);

    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);

    // Select is set where the two masks agree; for a degenerate length the
    // selects below fall back to LengthSq instead of the computed values
    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
    Length = XMVectorSelect(LengthSq, Length, Select);
    Normal = XMVectorSelect(LengthSq, Normal, Select);

    // Masks for "length above the maximum" and "length below the minimum"
    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);

    // Replace the length by the violated limit, if any
    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);

    // Rebuild the vector from its direction and the clamped length
    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);

    // Preserve the original vector (with no precision loss) if the length falls within the given range
    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
    Result = XMVectorSelect(Result, V, Control);

    return Result;
}

//------------------------------------------------------------------------------

// Reflects Incident about Normal (2D dot product):
//
//     Result = Incident - (2 * dot(Incident, Normal)) * Normal
inline XMVECTOR XM_CALLCONV XMVector2Reflect
(
    FXMVECTOR Incident,
    FXMVECTOR Normal
) noexcept
{
    const XMVECTOR IDotN = XMVector2Dot(Incident, Normal);

    // Doubling via addition: IDotN + IDotN.
    const XMVECTOR TwiceIDotN = XMVectorAdd(IDotN, IDotN);

    // Incident - TwiceIDotN * Normal
    return XMVectorNegativeMultiplySubtract(TwiceIDotN, Normal, Incident);
}

//------------------------------------------------------------------------------

// Scalar-index convenience overload: replicates RefractionIndex across a
// vector and forwards to XMVector2RefractV.
inline XMVECTOR XM_CALLCONV XMVector2Refract
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    float    RefractionIndex
) noexcept
{
    const XMVECTOR ReplicatedIndex = XMVectorReplicate(RefractionIndex);

    return XMVector2RefractV(Incident, Normal, ReplicatedIndex);
}

//------------------------------------------------------------------------------

// Return the refraction of a 2D vector
//
// Incident        - incident vector (x and y are used for the dot product).
// Normal          - surface normal (x and y are used for the dot product).
// RefractionIndex - refraction index, applied per lane: lane x drives the x
//                   result and lane y drives the y result.
//
// Each result lane is computed from the formula at the top of the body.  A
// lane whose radicand 1 - RI*RI*(1 - dot(I,N)^2) is negative is returned as
// zero instead of taking the square root.
//
// NOTE(review): the scalar path keeps a lane whose radicand is exactly zero
// (test is >= 0), whereas the NEON and SSE paths zero it (mask is a strict
// > 0 compare) -- confirm which behaviour is intended.
// NOTE(review): z/w of the result differ per path: the scalar path writes
// 0, the NEON path duplicates the x/y result into z/w, and the SSE path
// evaluates the same formula on the z/w lanes of the inputs.
inline XMVECTOR XM_CALLCONV XMVector2RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
) noexcept
{
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))

#if defined(_XM_NO_INTRINSICS_)

    float IDotN = (Incident.vector4_f32[0] * Normal.vector4_f32[0]) + (Incident.vector4_f32[1] * Normal.vector4_f32[1]);
    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    // RY first holds the shared (1 - IDotN^2) term; RX is derived from it
    // before RY is overwritten with its own radicand.
    float RY = 1.0f - (IDotN * IDotN);
    float RX = 1.0f - (RY * RefractionIndex.vector4_f32[0] * RefractionIndex.vector4_f32[0]);
    RY = 1.0f - (RY * RefractionIndex.vector4_f32[1] * RefractionIndex.vector4_f32[1]);
    // A negative radicand has no real square root: that lane becomes 0.
    if (RX >= 0.0f)
    {
        RX = (RefractionIndex.vector4_f32[0] * Incident.vector4_f32[0]) - (Normal.vector4_f32[0] * ((RefractionIndex.vector4_f32[0] * IDotN) + sqrtf(RX)));
    }
    else
    {
        RX = 0.0f;
    }
    if (RY >= 0.0f)
    {
        RY = (RefractionIndex.vector4_f32[1] * Incident.vector4_f32[1]) - (Normal.vector4_f32[1] * ((RefractionIndex.vector4_f32[1] * IDotN) + sqrtf(RY)));
    }
    else
    {
        RY = 0.0f;
    }

    XMVECTOR vResult;
    vResult.vector4_f32[0] = RX;
    vResult.vector4_f32[1] = RY;
    vResult.vector4_f32[2] = 0.0f;
    vResult.vector4_f32[3] = 0.0f;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Only the low halves (x, y) of the inputs take part in this path.
    float32x2_t IL = vget_low_f32(Incident);
    float32x2_t NL = vget_low_f32(Normal);
    float32x2_t RIL = vget_low_f32(RefractionIndex);
    // Get the 2D Dot product of Incident-Normal
    float32x2_t vTemp = vmul_f32(IL, NL);
    float32x2_t IDotN = vpadd_f32(vTemp, vTemp);
    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    vTemp = vmls_f32(vget_low_f32(g_XMOne), IDotN, IDotN);
    vTemp = vmul_f32(vTemp, RIL);
    vTemp = vmls_f32(vget_low_f32(g_XMOne), vTemp, RIL);
    // If any terms are <=0, sqrt() will fail, punt to zero
    uint32x2_t vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero));
    // Sqrt(vTemp)
    // Built from a reciprocal-square-root estimate (vrsqrte) refined by two
    // vrsqrts steps, then multiplied by vTemp: x * (1/sqrt(x)) = sqrt(x).
    float32x2_t S0 = vrsqrte_f32(vTemp);
    float32x2_t P0 = vmul_f32(vTemp, S0);
    float32x2_t R0 = vrsqrts_f32(P0, S0);
    float32x2_t S1 = vmul_f32(S0, R0);
    float32x2_t P1 = vmul_f32(vTemp, S1);
    float32x2_t R1 = vrsqrts_f32(P1, S1);
    float32x2_t S2 = vmul_f32(S1, R1);
    vTemp = vmul_f32(vTemp, S2);
    // R = RefractionIndex * IDotN + sqrt(R)
    vTemp = vmla_f32(vTemp, RIL, IDotN);
    // Result = RefractionIndex * Incident - Normal * R
    float32x2_t vResult = vmul_f32(RIL, IL);
    vResult = vmls_f32(vResult, vTemp, NL);
    // Clear the lanes whose radicand failed the > 0 test.
    vResult = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(vResult), vMask));
    // The 2-lane result is duplicated into both halves of the return value.
    return vcombine_f32(vResult, vResult);
#elif defined(_XM_SSE_INTRINSICS_)
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
    // Get the 2D Dot product of Incident-Normal
    XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    XMVECTOR vTemp = XM_FNMADD_PS(IDotN, IDotN, g_XMOne);
    vTemp = _mm_mul_ps(vTemp, RefractionIndex);
    vTemp = XM_FNMADD_PS(vTemp, RefractionIndex, g_XMOne);
    // If any terms are <=0, sqrt() will fail, punt to zero
    XMVECTOR vMask = _mm_cmpgt_ps(vTemp, g_XMZero);
    // R = RefractionIndex * IDotN + sqrt(R)
    vTemp = _mm_sqrt_ps(vTemp);
    vTemp = XM_FMADD_PS(RefractionIndex, IDotN, vTemp);
    // Result = RefractionIndex * Incident - Normal * R
    XMVECTOR vResult = _mm_mul_ps(RefractionIndex, Incident);
    vResult = XM_FNMADD_PS(vTemp, Normal, vResult);
    // Clear the lanes whose radicand failed the > 0 test.
    vResult = _mm_and_ps(vResult, vMask);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Returns a 2D vector perpendicular to V: (x, y) -> (-y, x).
//
// The scalar and NEON paths return zero in z/w; the SSE path leaves the
// z/w lanes of V in place before the multiply by g_XMNegateX.
inline XMVECTOR XM_CALLCONV XMVector2Orthogonal(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float SourceX = V.vector4_f32[0];
    const float SourceY = V.vector4_f32[1];

    XMVECTORF32 Perpendicular = { { { -SourceY, SourceX, 0.f, 0.f } } };
    return Perpendicular.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 SignPattern = { { { -1.f, 1.f, 0, 0 } } };

    // Swap the x and y lanes, then negate the new x lane.
    const float32x2_t Swapped = vrev64_f32(vget_low_f32(V));
    const float32x2_t Perpendicular = vmul_f32(Swapped, vget_low_f32(SignPattern));

    // Upper half (z, w) is filled with zero.
    const float32x2_t UpperZero = vdup_n_f32(0);
    return vcombine_f32(Perpendicular, UpperZero);
#elif defined(_XM_SSE_INTRINSICS_)
    // Lanes become (y, x, z, w); the multiply then applies g_XMNegateX.
    const XMVECTOR Swapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 2, 0, 1));
    return _mm_mul_ps(Swapped, g_XMNegateX);
#endif
}

//------------------------------------------------------------------------------

// Estimated angle between two 2D vectors that the caller supplies already
// normalized: the dot product is clamped to [-1, 1] and passed to
// XMVectorACosEst.
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormalsEst
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR RawCosine = XMVector2Dot(N1, N2);

    // Keep the arc-cosine argument inside [-1, 1].
    const XMVECTOR Cosine = XMVectorClamp(RawCosine, g_XMNegativeOne.v, g_XMOne.v);

    return XMVectorACosEst(Cosine);
}

//------------------------------------------------------------------------------

// Angle between two 2D vectors that the caller supplies already normalized:
// the dot product is clamped to [-1, 1] and passed to XMVectorACos.
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenNormals
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR RawCosine = XMVector2Dot(N1, N2);

    // Keep the arc-cosine argument inside [-1, 1].
    const XMVECTOR Cosine = XMVectorClamp(RawCosine, g_XMNegativeOne, g_XMOne);

    return XMVectorACos(Cosine);
}

//------------------------------------------------------------------------------

// Angle between two 2D vectors of arbitrary length:
//
//     acos(clamp(dot(V1, V2) * (1/|V1| * 1/|V2|), -1, 1))
inline XMVECTOR XM_CALLCONV XMVector2AngleBetweenVectors
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR InvLength1 = XMVector2ReciprocalLength(V1);
    const XMVECTOR InvLength2 = XMVector2ReciprocalLength(V2);
    const XMVECTOR InvLengthProduct = XMVectorMultiply(InvLength1, InvLength2);

    const XMVECTOR V1DotV2 = XMVector2Dot(V1, V2);

    // Normalize the dot product, then keep it inside [-1, 1] for the
    // arc-cosine.
    const XMVECTOR RawCosine = XMVectorMultiply(V1DotV2, InvLengthProduct);
    const XMVECTOR Cosine = XMVectorClamp(RawCosine, g_XMNegativeOne.v, g_XMOne.v);

    return XMVectorACos(Cosine);
}

//------------------------------------------------------------------------------

// Distance from Point to the line through LinePoint1 and LinePoint2.
inline XMVECTOR XM_CALLCONV XMVector2LinePointDistance
(
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2,
    FXMVECTOR Point
) noexcept
{
    // Work relative to LinePoint1.  ToPoint runs from LinePoint1 to Point
    // and Along runs from LinePoint1 to LinePoint2.  The projection of
    // ToPoint onto the line is
    //
    //     Along * (dot(ToPoint, Along) / LengthSq(Along))
    //
    // and the distance is the length of what is left of ToPoint after that
    // projection is subtracted.

    const XMVECTOR ToPoint = XMVectorSubtract(Point, LinePoint1);
    const XMVECTOR Along = XMVectorSubtract(LinePoint2, LinePoint1);

    const XMVECTOR AlongLengthSq = XMVector2LengthSq(Along);
    const XMVECTOR ProjectionScale = XMVectorDivide(XMVector2Dot(ToPoint, Along), AlongLengthSq);

    const XMVECTOR Projection = XMVectorMultiply(Along, ProjectionScale);
    const XMVECTOR Perpendicular = XMVectorSubtract(ToPoint, Projection);

    return XMVector2Length(Perpendicular);
}

//------------------------------------------------------------------------------

// Intersects two 2D lines, each described by two points on the line.
//
// Line1Point1, Line1Point2 - two points on the first line.
// Line2Point1, Line2Point2 - two points on the second line.
//
// With V1 = Line1Point2 - Line1Point1, V2 = Line2Point2 - Line2Point1 and
// V3 = Line1Point1 - Line2Point1, the function forms C1 = cross(V1, V2) and
// C2 = cross(V2, V3) and returns:
//   - Line1Point1 + V1 * (C2 / C1) when C1 is not within epsilon of zero;
//   - g_XMInfinity when both C1 and C2 are within epsilon of zero
//     (the "coincident" case);
//   - g_XMQNaN when only C1 is within epsilon of zero (the "parallel" case).
// The scalar/NEON path decides this once with XMVector2NearEqual; the SSE
// path builds per-lane masks from |C1| and |C2|.
inline XMVECTOR XM_CALLCONV XMVector2IntersectLine
(
    FXMVECTOR Line1Point1,
    FXMVECTOR Line1Point2,
    FXMVECTOR Line2Point1,
    GXMVECTOR Line2Point2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);

    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);

    XMVECTOR Result;
    const XMVECTOR Zero = XMVectorZero();
    // C1 near zero means the two direction vectors do not cross.
    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
    {
        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
        {
            // Coincident
            Result = g_XMInfinity.v;
        }
        else
        {
            // Parallel
            Result = g_XMQNaN.v;
        }
    }
    else
    {
        // Intersection point = Line1Point1 + V1 * (C2 / C1)
        XMVECTOR Scale = XMVectorReciprocal(C1);
        Scale = XMVectorMultiply(C2, Scale);
        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
    }

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
    // Generate the cross products
    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);
    // If C1 is not close to epsilon, use the calculated value
    // |C1| is formed as max(0 - C1, C1).
    XMVECTOR vResultMask = _mm_setzero_ps();
    vResultMask = _mm_sub_ps(vResultMask, C1);
    vResultMask = _mm_max_ps(vResultMask, C1);
    // 0xFFFFFFFF if the calculated value is to be used
    vResultMask = _mm_cmpgt_ps(vResultMask, g_XMEpsilon);
    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
    // |C2| is formed the same way; lanes with |C2| <= epsilon select
    // infinity, the remaining lanes select QNaN.
    XMVECTOR vFailMask = _mm_setzero_ps();
    vFailMask = _mm_sub_ps(vFailMask, C2);
    vFailMask = _mm_max_ps(vFailMask, C2);
    vFailMask = _mm_cmple_ps(vFailMask, g_XMEpsilon);
    XMVECTOR vFail = _mm_and_ps(vFailMask, g_XMInfinity);
    vFailMask = _mm_andnot_ps(vFailMask, g_XMQNaN);
    // vFail is NAN or INF
    vFail = _mm_or_ps(vFail, vFailMask);
    // Intersection point = Line1Point1 + V1 * (C2 / C1)
    // Computed unconditionally; lanes that fail the C1 test are masked out below.
    XMVECTOR vResult = _mm_div_ps(C2, C1);
    vResult = XM_FMADD_PS(vResult, V1, Line1Point1);
    // Use result, or failure value
    vResult = _mm_and_ps(vResult, vResultMask);
    vResultMask = _mm_andnot_ps(vResultMask, vFail);
    vResult = _mm_or_ps(vResult, vResultMask);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Transforms the 2D vector V by M and returns all four resulting lanes:
//
//     V.x * M.r[0] + V.y * M.r[1] + M.r[3]
//
// M.r[2] does not take part in the computation.
inline XMVECTOR XM_CALLCONV XMVector2Transform
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR SplatY = XMVectorSplatY(V);
    const XMVECTOR SplatX = XMVectorSplatX(V);

    // y * r1 + r3 first, then x * r0 is accumulated on top.
    const XMVECTOR Partial = XMVectorMultiplyAdd(SplatY, M.r[1], M.r[3]);
    return XMVectorMultiplyAdd(SplatX, M.r[0], Partial);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t XY = vget_low_f32(V);
    const float32x4_t Partial = vmlaq_lane_f32(M.r[3], M.r[1], XY, 1); // r3 + r1 * y
    return vmlaq_lane_f32(Partial, M.r[0], XY, 0); // ... + r0 * x
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR SplatY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR SplatX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

    // y * r1 + r3 first, then x * r0 is accumulated on top.
    const XMVECTOR Partial = XM_FMADD_PS(SplatY, M.r[1], M.r[3]);
    return XM_FMADD_PS(SplatX, M.r[0], Partial);
#endif
}

//------------------------------------------------------------------------------


// Transforms a stream of 2D vectors by M, writing one XMFLOAT4 per input.
//
// For every input (x, y) the stored result is
//     x * M.r[0] + y * M.r[1] + M.r[3]
// (M.r[2] is not used).
//
// pOutputStream - first output element; must not be null.
// OutputStride  - byte distance between consecutive outputs; must be at
//                 least sizeof(XMFLOAT4).
// pInputStream  - first input element; must not be null.
// InputStride   - byte distance between consecutive inputs; must be at
//                 least sizeof(XMFLOAT2).
// VectorCount   - number of vectors to transform.
// M             - transformation matrix.
//
// Returns pOutputStream.
//
// The SIMD paths process several vectors per iteration when the input is
// tightly packed (InputStride == sizeof(XMFLOAT2)) and fall back to a
// one-vector-at-a-time loop for the remainder or for other strides.
inline XMFLOAT4* XM_CALLCONV XMVector2TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT2* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT2));
    

    assert(OutputStride >= sizeof(XMFLOAT4));
    

#if defined(_XM_NO_INTRINSICS_)

    // Byte pointers so the caller-supplied strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
        Result = XMVectorMultiplyAdd(X, row0, Result);

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif

        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Batched path: only when both streams are tightly packed.
        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT4)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load of four XMFLOAT2: val[0] holds the
                // four x values, val[1] the four y values.
                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT2) * 4;

                // Each vResultN accumulates one output component for all
                // four vectors at once.
                float32x2_t r3 = vget_low_f32(row3);
                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N

                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(row3);
                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O
                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                float32x4x4_t R;
                R.val[0] = vResult0;
                R.val[1] = vResult1;
                R.val[2] = vResult2;
                R.val[3] = vResult3;

                // Interleaving store: writes four consecutive XMFLOAT4 results.
                vst4q_f32(reinterpret_cast<float*>(pOutputVector), R);
                pOutputVector += sizeof(XMFLOAT4) * 4;

                i += 4;
            }
        }
    }

    // Remainder (or every vector when the batched path was not taken).
    for (; i < VectorCount; i++)
    {
        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y

        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_AVX2_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Each matrix row is duplicated into both 128-bit halves.
        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);

        if (InputStride == sizeof(XMFLOAT2))
        {
            if (OutputStride == sizeof(XMFLOAT4))
            {
                // 32-byte aligned destination: use XM256_STREAM_PS
                // (presumably a non-temporal store -- confirm against the
                // macro definition).
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Four XMFLOAT2: vectors 0,1 in the low half and
                        // vectors 2,3 in the high half.
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        // Shuffles act within each 128-bit half, so X1/Y1
                        // carry vectors 0 | 2 and X2/Y2 carry vectors 1 | 3.
                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                        __m256 vTempA = _mm256_mul_ps(X1, row0);
                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                        vTempA = _mm256_add_ps(vTempA, vTempB);
                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                        // Regroup into output order: results 0,1 ...
                        X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1);
                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        // ... then results 2,3.
                        X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X2);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        i += 4;
                    }
                }
                else
                {
                    // Packed input, packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                        __m256 vTempA = _mm256_mul_ps(X1, row0);
                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                        vTempA = _mm256_add_ps(vTempA, vTempB);
                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                        X1 = _mm256_insertf128_ps(vTempA, _mm256_castps256_ps128(vTempA2), 1);
                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        X2 = _mm256_insertf128_ps(vTempA2, _mm256_extractf128_ps(vTempA, 1), 0);
                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X2);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 4;

                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                    __m256 vTempA = _mm256_mul_ps(X1, row0);
                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                    vTempA = _mm256_add_ps(vTempA, vTempB);
                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                    // Results are written one at a time in input order
                    // (0, 1, 2, 3), each OutputStride bytes apart.
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempA));
                    pOutputVector += OutputStride;

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempA2));
                    pOutputVector += OutputStride;

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempA, 1));
                    pOutputVector += OutputStride;

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempA2, 1));
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remainder (or every vector when the input was not tightly packed).
    if (i < VectorCount)
    {
        const XMVECTOR row0 = M.r[0];
        const XMVECTOR row1 = M.r[1];
        const XMVECTOR row3 = M.r[3];

        for (; i < VectorCount; i++)
        {
            // 8-byte load of one (x, y) pair into the low two lanes.
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    // NOTE(review): presumably a store fence paired with the streaming
    // stores above -- confirm against the XM_SFENCE definition.
    XM_SFENCE();

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t two = VectorCount >> 1;
    if (two > 0)
    {
        // Two-at-a-time path: only when the input is tightly packed, so one
        // 16-byte load fetches two XMFLOAT2 values.
        if (InputStride == sizeof(XMFLOAT2))
        {
            // Destination base and stride both multiples of 16 bytes:
            // every output slot is 16-byte aligned.
            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
            {
                // Packed input, aligned output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 2;

                    // First vector: lanes 0 (x) and 1 (y).
                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Second vector: lanes 2 (x) and 3 (y).
                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                    vTemp = XM_FMADD_PS(Y, row1, row3);
                    vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 2;
                }
            }
            else
            {
                // Packed input, unaligned output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 2;

                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                    vTemp = XM_FMADD_PS(Y, row1, row3);
                    vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 2;
                }
            }
        }
    }

    // Remainder (or every vector when the input was not tightly packed).
    // The input test uses the current read position; the output test uses
    // the stream base together with the stride.
    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
    {
        if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
        {
            // Aligned input, aligned output
            for (; i < VectorCount; i++)
            {
                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
                pInputVector += InputStride;

                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                vTemp = _mm_add_ps(vTemp, vTemp2);

                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Aligned input, unaligned output
            for (; i < VectorCount; i++)
            {
                XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
                pInputVector += InputStride;

                XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                vTemp = _mm_add_ps(vTemp, vTemp2);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                pOutputVector += OutputStride;
            }
        }
    }
    else
    {
        // Unaligned input
        for (; i < VectorCount; i++)
        {
            // 8-byte load of one (x, y) pair into the low two lanes.
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    // NOTE(review): presumably a store fence paired with the streaming
    // stores above -- confirm against the XM_SFENCE definition.
    XM_SFENCE();

    return pOutputStream;
#endif
}

//------------------------------------------------------------------------------

// Transforms the 2D vector V by M and divides every lane of the result by
// its own w component:
//
//     R = V.x * M.r[0] + V.y * M.r[1] + M.r[3]
//     return R / R.w
inline XMVECTOR XM_CALLCONV XMVector2TransformCoord
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
    const XMVECTOR SplatY = XMVectorSplatY(V);
    const XMVECTOR SplatX = XMVectorSplatX(V);

    // y * r1 + r3 first, then x * r0 is accumulated on top.
    const XMVECTOR Partial = XMVectorMultiplyAdd(SplatY, M.r[1], M.r[3]);
    const XMVECTOR Transformed = XMVectorMultiplyAdd(SplatX, M.r[0], Partial);

    const XMVECTOR Divisor = XMVectorSplatW(Transformed);
    return XMVectorDivide(Transformed, Divisor);
}

//------------------------------------------------------------------------------


inline XMFLOAT2* XM_CALLCONV XMVector2TransformCoordStream
(
    XMFLOAT2* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT2* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT2));
    

    assert(OutputStride >= sizeof(XMFLOAT2));
    

#if defined(_XM_NO_INTRINSICS_)

    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMVECTOR W = XMVectorSplatW(Result);

        Result = XMVectorDivide(Result, W);

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif

        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT2) * 4;

                float32x2_t r3 = vget_low_f32(row3);
                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N

                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(row3);
                r = vget_high_f32(row0);
                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
                V.val[0] = vdivq_f32(vResult0, W);
                V.val[1] = vdivq_f32(vResult1, W);
#else
                // 2 iterations of Newton-Raphson refinement of reciprocal
                float32x4_t Reciprocal = vrecpeq_f32(W);
                float32x4_t S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);
                S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);

                V.val[0] = vmulq_f32(vResult0, Reciprocal);
                V.val[1] = vmulq_f32(vResult1, Reciprocal);
#endif

                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT2) * 4;

                i += 4;
            }
        }
    }

    for (; i < VectorCount; i++)
    {
        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, V, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y

        V = vget_high_f32(vResult);
        float32x2_t W = vdup_lane_f32(V, 1);

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
        V = vget_low_f32(vResult);
        V = vdiv_f32(V, W);
#else
        // 2 iterations of Newton-Raphson refinement of reciprocal for W
        float32x2_t Reciprocal = vrecpe_f32(W);
        float32x2_t S = vrecps_f32(Reciprocal, W);
        Reciprocal = vmul_f32(S, Reciprocal);
        S = vrecps_f32(Reciprocal, W);
        Reciprocal = vmul_f32(S, Reciprocal);

        V = vget_low_f32(vResult);
        V = vmul_f32(V, Reciprocal);
#endif

        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_AVX2_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);

        if (InputStride == sizeof(XMFLOAT2))
        {
            if (OutputStride == sizeof(XMFLOAT2))
            {
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                        __m256 vTempA = _mm256_mul_ps(X1, row0);
                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                        vTempA = _mm256_add_ps(vTempA, vTempB);
                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
                        vTempA = _mm256_div_ps(vTempA, W);

                        W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
                        vTempA2 = _mm256_div_ps(vTempA2, W);

                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT2) * 4;

                        i += 4;
                    }
                }
                else
                {
                    // Packed input, packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                        __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                        __m256 vTempA = _mm256_mul_ps(X1, row0);
                        __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                        vTempA = _mm256_add_ps(vTempA, vTempB);
                        vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                        __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
                        vTempA = _mm256_div_ps(vTempA, W);

                        W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
                        vTempA2 = _mm256_div_ps(vTempA2, W);

                        X1 = _mm256_shuffle_ps(vTempA, vTempA2, 0x44);
                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT2) * 4;

                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 4;

                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                    __m256 vTempB = _mm256_fmadd_ps(Y1, row1, row3);
                    __m256 vTempB2 = _mm256_fmadd_ps(Y2, row1, row3);
                    __m256 vTempA = _mm256_mul_ps(X1, row0);
                    __m256 vTempA2 = _mm256_mul_ps(X2, row0);
                    vTempA = _mm256_add_ps(vTempA, vTempB);
                    vTempA2 = _mm256_add_ps(vTempA2, vTempB2);

                    __m256 W = _mm256_shuffle_ps(vTempA, vTempA, _MM_SHUFFLE(3, 3, 3, 3));
                    vTempA = _mm256_div_ps(vTempA, W);

                    W = _mm256_shuffle_ps(vTempA2, vTempA2, _MM_SHUFFLE(3, 3, 3, 3));
                    vTempA2 = _mm256_div_ps(vTempA2, W);

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_castps256_ps128(vTempA2)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_extractf128_ps(vTempA2, 1)));
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    if (i < VectorCount)
    {
        const XMVECTOR row0 = M.r[0];
        const XMVECTOR row1 = M.r[1];
        const XMVECTOR row3 = M.r[3];

        for (; i < VectorCount; i++)
        {
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);

            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
            vTemp = _mm_div_ps(vTemp, W);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }

    XM_SFENCE();

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t two = VectorCount >> 1;
    if (two > 0)
    {
        if (InputStride == sizeof(XMFLOAT2))
        {
            if (OutputStride == sizeof(XMFLOAT2))
            {
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < two; ++j)
                    {
                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 2;

                        // Result 1
                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        XMVECTOR V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                        vTemp = XM_FMADD_PS(Y, row1, row3);
                        vTemp2 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        XMVECTOR V2 = _mm_div_ps(vTemp, W);

                        vTemp = _mm_movelh_ps(V1, V2);

                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                        pOutputVector += sizeof(XMFLOAT2) * 2;

                        i += 2;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    for (size_t j = 0; j < two; ++j)
                    {
                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 2;

                        // Result 1
                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                        XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        XMVECTOR V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                        vTemp = XM_FMADD_PS(Y, row1, row3);
                        vTemp2 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        XMVECTOR V2 = _mm_div_ps(vTemp, W);

                        vTemp = _mm_movelh_ps(V1, V2);

                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                        pOutputVector += sizeof(XMFLOAT2) * 2;

                        i += 2;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 2;

                    // Result 1
                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
                    pOutputVector += OutputStride;

                    // Result 2
                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                    vTemp = XM_FMADD_PS(Y, row1, row3);
                    vTemp2 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
                    pOutputVector += OutputStride;

                    i += 2;
                }
            }
        }
    }

    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
    {
        // Aligned input
        for (; i < VectorCount; i++)
        {
            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);

            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

            vTemp = _mm_div_ps(vTemp, W);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }
    else
    {
        // Unaligned input
        for (; i < VectorCount; i++)
        {
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Y, row1, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);

            XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

            vTemp = _mm_div_ps(vTemp, W);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }

    XM_SFENCE();

    return pOutputStream;
#endif
}

//------------------------------------------------------------------------------

// Transforms the 2D vector held in the x and y lanes of V by rows 0 and 1 of M:
//     result = V.x * M.r[0] + V.y * M.r[1]
// Rows 2 and 3 of M are never read, so no translation is applied.
// All four lanes of the accumulated sum are returned.
inline XMVECTOR XM_CALLCONV XMVector2TransformNormal
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR SplatX = XMVectorSplatX(V);
    const XMVECTOR SplatY = XMVectorSplatY(V);

    // Form y * row1 first, then accumulate x * row0 on top of it.
    return XMVectorMultiplyAdd(SplatX, M.r[0], XMVectorMultiply(SplatY, M.r[1]));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lanes 0 and 1 of the low half are the x and y inputs.
    const float32x2_t xyPair = vget_low_f32(V);
    const float32x4_t yTerm = vmulq_lane_f32(M.r[1], xyPair, 1); // y * row1
    return vmlaq_lane_f32(yTerm, M.r[0], xyPair, 0); // + x * row0
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR splatX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
    const XMVECTOR splatY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR yTerm = _mm_mul_ps(splatY, M.r[1]); // y * row1
    return XM_FMADD_PS(splatX, M.r[0], yTerm); // x * row0 + y * row1
#endif
}

//------------------------------------------------------------------------------


// Transforms a stream of 2D vectors by rows 0 and 1 of M. Row 3 is never read,
// so no translation is applied:
//     out = in.x * M.r[0] + in.y * M.r[1]
// Only the x and y components of each result are written.
//
// pOutputStream - first output element; must not be null (asserted).
// OutputStride  - distance in bytes between consecutive outputs; asserted to be
//                 at least sizeof(XMFLOAT2).
// pInputStream  - first input element; must not be null (asserted).
// InputStride   - distance in bytes between consecutive inputs; asserted to be
//                 at least sizeof(XMFLOAT2).
// VectorCount   - number of vectors to transform.
// M             - transform matrix; only M.r[0] and M.r[1] are used.
//
// Returns pOutputStream.
//
// Each SIMD path has a batched fast path that is only taken for tightly packed
// input (InputStride == sizeof(XMFLOAT2)); whatever the fast path does not
// consume is handled by a one-vector-at-a-time loop.
inline XMFLOAT2* XM_CALLCONV XMVector2TransformNormalStream
(
    XMFLOAT2* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT2* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT2));
    

    assert(OutputStride >= sizeof(XMFLOAT2));
    

#if defined(_XM_NO_INTRINSICS_)

    // Byte pointers so that the strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pInputVector));
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // y * row1, then accumulate x * row0.
        XMVECTOR Result = XMVectorMultiply(Y, row1);
        Result = XMVectorMultiplyAdd(X, row0, Result);

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif

        XMStoreFloat2(reinterpret_cast<XMFLOAT2*>(pOutputVector), Result);

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Byte pointers so that the strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];

    // i counts the vectors already written; the batched loop below advances it
    // by 4 per iteration and the scalar loop finishes whatever is left.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Batched path requires both streams to be tightly packed.
        if ((InputStride == sizeof(XMFLOAT2)) && (OutputStride == sizeof(XMFLOAT2)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load of four vectors: val[0] holds the four x
                // values, val[1] the four y values.
                float32x4x2_t V = vld2q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT2) * 4;

                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx

                // pInputVector already points at the next batch here.
                XM_PREFETCH(pInputVector);
                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));
                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                // Reuse V to hold the four output x values and four output y values.
                V.val[0] = vResult0;
                V.val[1] = vResult1;

                // Interleaving store writes them back as four packed (x, y) pairs.
                vst2q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT2) * 4;

                i += 4;
            }
        }
    }

    // Remaining vectors (all of them when the batched path was not taken).
    for (; i < VectorCount; i++)
    {
        float32x2_t V = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR vResult = vmulq_lane_f32(row0, V, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, V, 1); // Y

        // Only the low two lanes (x, y) are stored.
        V = vget_low_f32(vResult);
        vst1_f32(reinterpret_cast<float*>(pOutputVector), V);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_AVX2_INTRINSICS_)
    // Byte pointers so that the strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // i counts the vectors already written; the batched loops advance it by 4
    // per iteration and the scalar loop at the end finishes whatever is left.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Each matrix row duplicated into both 128-bit halves.
        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);

        // Batched paths require tightly packed input; otherwise nothing is done
        // here and the scalar loop below processes the whole stream.
        if (InputStride == sizeof(XMFLOAT2))
        {
            if (OutputStride == sizeof(XMFLOAT2))
            {
                // Output start is 32-byte aligned: XM256_STREAM_PS is used for the
                // stores (macro defined outside this chunk; presumably a
                // non-temporal store - confirm).
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // VV = [x0 y0 x1 y1 | x2 y2 x3 y3]
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        // Shuffles act per 128-bit half: X1/Y1 splat vectors 0 and 2,
                        // X2/Y2 splat vectors 1 and 3.
                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        // vTempA = results for vectors 0 | 2, vTempB = results for vectors 1 | 3.
                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);

                        // 0x44 keeps lanes 0,1 of each operand per half, giving
                        // [r0.x r0.y r1.x r1.y | r2.x r2.y r3.x r3.y].
                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT2) * 4;

                        i += 4;
                    }
                }
                else
                {
                    // Packed input, packed output
                    // Same arithmetic as the aligned loop above; only the store differs.
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 4;

                        __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                        __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                        __m256 vTempA = _mm256_mul_ps(Y1, row1);
                        __m256 vTempB = _mm256_mul_ps(Y2, row1);
                        vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
                        vTempB = _mm256_fmadd_ps(X2, row0, vTempB);

                        X1 = _mm256_shuffle_ps(vTempA, vTempB, 0x44);
                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), X1);
                        pOutputVector += sizeof(XMFLOAT2) * 4;

                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 4;

                    __m256 Y2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));
                    __m256 X2 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                    __m256 Y1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                    __m256 X1 = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));

                    __m256 vTempA = _mm256_mul_ps(Y1, row1);
                    __m256 vTempB = _mm256_mul_ps(Y2, row1);
                    vTempA = _mm256_fmadd_ps(X1, row0, vTempA);
                    vTempB = _mm256_fmadd_ps(X2, row0, vTempB);

                    // Each store writes the low 64 bits (x, y) of one result.
                    // Order restores the input sequence: A low (vector 0),
                    // B low (vector 1), A high (vector 2), B high (vector 3).
                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_castps256_ps128(vTempA)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_castps256_ps128(vTempB)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_extractf128_ps(vTempA, 1)));
                    pOutputVector += OutputStride;

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector),
                        _mm_castps_pd(_mm256_extractf128_ps(vTempB, 1)));
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remaining vectors (all of them when no batched path was taken).
    if (i < VectorCount)
    {
        const XMVECTOR row0 = M.r[0];
        const XMVECTOR row1 = M.r[1];

        for (; i < VectorCount; i++)
        {
            // Load one (x, y) pair as a single 64-bit value.
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
            vTemp = XM_FMADD_PS(X, row0, vTemp);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }

    // Executed on every call, whether or not a streaming store was issued.
    // XM_SFENCE is a macro defined outside this chunk; presumably a store
    // fence paired with the streaming stores above - confirm.
    XM_SFENCE();

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    // Byte pointers so that the strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];

    // i counts the vectors already written; the batched loops advance it by 2
    // per iteration and the scalar loops at the end finish whatever is left.
    size_t i = 0;
    size_t two = VectorCount >> 1;
    if (two > 0)
    {
        // Batched paths require tightly packed input; otherwise nothing is done
        // here and the scalar loops below process the whole stream.
        if (InputStride == sizeof(XMFLOAT2))
        {
            if (OutputStride == sizeof(XMFLOAT2))
            {
                // Output start is 16-byte aligned: XM_STREAM_PS is used for the
                // stores (macro defined outside this chunk; presumably a
                // non-temporal store - confirm).
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < two; ++j)
                    {
                        // V = [x0 y0 x1 y1]
                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 2;

                        // Result 1
                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);

                        // Result 2
                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                        vTemp = _mm_mul_ps(Y, row1);
                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);

                        // Pack the two results as [V1.x V1.y V2.x V2.y].
                        vTemp = _mm_movelh_ps(V1, V2);

                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                        pOutputVector += sizeof(XMFLOAT2) * 2;

                        i += 2;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    // Same arithmetic as the aligned loop above; only the store differs.
                    for (size_t j = 0; j < two; ++j)
                    {
                        XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT2) * 2;

                        // Result 1
                        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = _mm_mul_ps(Y, row1);
                        XMVECTOR V1 = XM_FMADD_PS(X, row0, vTemp);

                        // Result 2
                        Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                        X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                        vTemp = _mm_mul_ps(Y, row1);
                        XMVECTOR V2 = XM_FMADD_PS(X, row0, vTemp);

                        vTemp = _mm_movelh_ps(V1, V2);

                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                        pOutputVector += sizeof(XMFLOAT2) * 2;

                        i += 2;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < two; ++j)
                {
                    XMVECTOR V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT2) * 2;

                    // Result 1
                    XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = _mm_mul_ps(Y, row1);
                    vTemp = XM_FMADD_PS(X, row0, vTemp);

                    // Store the low 64 bits (x, y) of the result.
                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
                    pOutputVector += OutputStride;

                    // Result 2
                    Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));
                    X = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));

                    vTemp = _mm_mul_ps(Y, row1);
                    vTemp = XM_FMADD_PS(X, row0, vTemp);

                    _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
                    pOutputVector += OutputStride;

                    i += 2;
                }
            }
        }
    }

    // Remaining vectors (all of them when no batched path was taken). The two
    // loops below differ only in the intrinsic used to load the 64-bit (x, y)
    // pair; the alignment test uses the current input position, not the start
    // of the stream.
    if (!(reinterpret_cast<uintptr_t>(pInputVector) & 0xF) && !(InputStride & 0xF))
    {
        // Aligned input
        for (; i < VectorCount; i++)
        {
            XMVECTOR V = _mm_castsi128_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
            vTemp = XM_FMADD_PS(X, row0, vTemp);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }
    else
    {
        // Unaligned input
        for (; i < VectorCount; i++)
        {
            __m128 xy = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(pInputVector)));
            pInputVector += InputStride;

            XMVECTOR Y = XM_PERMUTE_PS(xy, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(xy, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = _mm_mul_ps(Y, row1);
            vTemp = XM_FMADD_PS(X, row0, vTemp);

            _mm_store_sd(reinterpret_cast<double*>(pOutputVector), _mm_castps_pd(vTemp));
            pOutputVector += OutputStride;
        }
    }

    // Executed on every call, whether or not a streaming store was issued.
    // XM_SFENCE is a macro defined outside this chunk; presumably a store
    // fence paired with the streaming stores above - confirm.
    XM_SFENCE();

    return pOutputStream;
#endif
}

/****************************************************************************
 *
 * 3D Vector
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Comparison operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------

// Returns true when the x, y and z components of V1 and V2 all compare equal
// as floats. The w component is ignored.
inline bool XM_CALLCONV XMVector3Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Bail out on the first component that differs, checking x, then y, then z.
    if (V1.vector4_f32[0] != V2.vector4_f32[0])
    {
        return false;
    }
    if (V1.vector4_f32[1] != V2.vector4_f32[1])
    {
        return false;
    }
    return (V1.vector4_f32[2] == V2.vector4_f32[2]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane compare yields an all-ones or all-zeros 32-bit mask per component.
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    const uint8x8_t lowBytes = vget_low_u8(vreinterpretq_u8_u32(laneMask));
    const uint8x8_t highBytes = vget_high_u8(vreinterpretq_u8_u32(laneMask));

    // Two zips gather one byte of every lane's mask into a single 32-bit word;
    // its low 24 bits carry the x, y and z lanes.
    const uint8x8x2_t byteZip = vzip_u8(lowBytes, highBytes);
    const uint16x4x2_t halfZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t xyzBits = vget_lane_u32(vreinterpret_u32_u16(halfZip.val[1]), 1) & 0xFFFFFFU;

    return (xyzBits == 0xFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Bits 0..2 of the movemask are the compare results for x, y and z.
    const int xyzMask = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2)) & 7;
    return (xyzMask == 7);
#endif
}

//------------------------------------------------------------------------------

// Float equality test on x, y and z that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - all three components are equal
//   XM_CRMASK_CR6FALSE - none of the three components are equal
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector3EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Count how many of the three components match.
    uint32_t equalLanes = 0;
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_f32[i] == V2.vector4_f32[i])
        {
            ++equalLanes;
        }
    }
    if (equalLanes == 3)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (equalLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    // Keep the x, y, z bytes only.
    const uint32_t xyzBits = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1) & 0xFFFFFFU;

    if (xyzBits == 0xFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpeq_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    const int xyzBits = _mm_movemask_ps(laneMask) & 7;
    if (xyzBits == 7)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#endif
}

//------------------------------------------------------------------------------

// Returns true when the x, y and z components of V1 and V2 hold identical
// 32-bit integer values. The w component does not take part in the test.
inline bool XM_CALLCONV XMVector3EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component whose bits differ.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_u32[i] != V2.vector4_u32[i])
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Bits 0..2 of the move mask correspond to x, y and z.
    return (_mm_movemask_ps(_mm_castsi128_ps(laneMask)) & 7) == 7;
#endif
}

//------------------------------------------------------------------------------

// Integer (bitwise) equality test on x, y and z that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - all three components are equal
//   XM_CRMASK_CR6FALSE - none of the three components are equal
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector3EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Count how many of the three components match.
    uint32_t equalLanes = 0;
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_u32[i] == V2.vector4_u32[i])
        {
            ++equalLanes;
        }
    }
    if (equalLanes == 3)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (equalLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    // Keep the x, y, z bytes only.
    const uint32_t xyzBits = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1) & 0xFFFFFFU;

    if (xyzBits == 0xFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Bits 0..2 of the move mask correspond to x, y and z.
    const int xyzBits = _mm_movemask_ps(_mm_castsi128_ps(laneMask)) & 7;
    if (xyzBits == 7)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#endif
}

//------------------------------------------------------------------------------

// Returns true when, for each of x, y and z, |V1 - V2| is less than or equal to
// the matching component of Epsilon. The w component is ignored.
inline bool XM_CALLCONV XMVector3NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Absolute per-component differences.
    const float deltaX = fabsf(V1.vector4_f32[0] - V2.vector4_f32[0]);
    const float deltaY = fabsf(V1.vector4_f32[1] - V2.vector4_f32[1]);
    const float deltaZ = fabsf(V1.vector4_f32[2] - V2.vector4_f32[2]);

    return (deltaX <= Epsilon.vector4_f32[0])
        && (deltaY <= Epsilon.vector4_f32[1])
        && (deltaZ <= Epsilon.vector4_f32[2]);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t difference = vsubq_f32(V1, V2);
    // Two spellings of the same "absolute value <= Epsilon" test, chosen by toolchain.
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
    const uint32x4_t laneMask = vacleq_f32(difference, Epsilon);
#else
    const uint32x4_t laneMask = vcleq_f32(vabsq_f32(difference), Epsilon);
#endif
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR difference = _mm_sub_ps(V1, V2);
    // |d| is formed as max(0 - d, d).
    const XMVECTOR negated = _mm_sub_ps(_mm_setzero_ps(), difference);
    const XMVECTOR magnitude = _mm_max_ps(negated, difference);
    const XMVECTOR laneMask = _mm_cmple_ps(magnitude, Epsilon);
    // Only bits 0..2 (x, y, z) are tested; w is a don't-care.
    return (_mm_movemask_ps(laneMask) & 7) == 0x7;
#endif
}

//------------------------------------------------------------------------------

// Returns true when at least one of the x, y, z components of V1 and V2
// differs under a floating-point compare. The w component is ignored.
inline bool XM_CALLCONV XMVector3NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // The first differing component settles the answer.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_f32[i] != V2.vector4_f32[i])
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // Anything short of "x, y and z all equal" counts as not equal.
    return (packed & 0xFFFFFFU) != 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpeq_ps(V1, V2);
    // Anything short of "x, y and z all equal" counts as not equal.
    return (_mm_movemask_ps(laneMask) & 7) != 7;
#endif
}

//------------------------------------------------------------------------------

// Returns true when at least one of the x, y, z components of V1 and V2
// differs as a 32-bit integer. The w component is ignored.
inline bool XM_CALLCONV XMVector3NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // The first differing component settles the answer.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_u32[i] != V2.vector4_u32[i])
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // Anything short of "x, y and z all equal" counts as not equal.
    return (packed & 0xFFFFFFU) != 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    // Anything short of "x, y and z all equal" counts as not equal.
    return (_mm_movemask_ps(_mm_castsi128_ps(laneMask)) & 7) != 7;
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is strictly greater than V2 in each of x, y and z.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that fails the comparison.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (!(V1.vector4_f32[i] > V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcgtq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpgt_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    return (_mm_movemask_ps(laneMask) & 7) == 7;
#endif
}

//------------------------------------------------------------------------------

// Greater-than test on x, y and z that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - V1 > V2 in all three components
//   XM_CRMASK_CR6FALSE - the "all false" outcome of the active code path
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector3GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Tally components that are greater and components that are less-or-equal.
    // They are counted separately because an unordered (NaN) pair is neither.
    uint32_t greaterLanes = 0;
    uint32_t lessEqualLanes = 0;
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_f32[i] > V2.vector4_f32[i])
        {
            ++greaterLanes;
        }
        else if (V1.vector4_f32[i] <= V2.vector4_f32[i])
        {
            ++lessEqualLanes;
        }
    }
    if (greaterLanes == 3)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (lessEqualLanes == 3)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcgtq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    // Keep the x, y, z bytes only.
    const uint32_t xyzBits = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1) & 0xFFFFFFU;

    if (xyzBits == 0xFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpgt_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    const int xyzBits = _mm_movemask_ps(laneMask) & 7;
    if (xyzBits == 7)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is greater than or equal to V2 in each of x, y and z.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that fails the comparison.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (!(V1.vector4_f32[i] >= V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcgeq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpge_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    return (_mm_movemask_ps(laneMask) & 7) == 7;
#endif
}

//------------------------------------------------------------------------------

// Greater-or-equal test on x, y and z that reports a comparison record:
//   XM_CRMASK_CR6TRUE  - V1 >= V2 in all three components
//   XM_CRMASK_CR6FALSE - the "all false" outcome of the active code path
//   0                  - mixed result
inline uint32_t XM_CALLCONV XMVector3GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Tally components that are greater-or-equal and components that are less.
    // They are counted separately because an unordered (NaN) pair is neither.
    uint32_t greaterEqualLanes = 0;
    uint32_t lessLanes = 0;
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (V1.vector4_f32[i] >= V2.vector4_f32[i])
        {
            ++greaterEqualLanes;
        }
        else if (V1.vector4_f32[i] < V2.vector4_f32[i])
        {
            ++lessLanes;
        }
    }
    if (greaterEqualLanes == 3)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (lessLanes == 3)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcgeq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    // Keep the x, y, z bytes only.
    const uint32_t xyzBits = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1) & 0xFFFFFFU;

    if (xyzBits == 0xFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmpge_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    const int xyzBits = _mm_movemask_ps(laneMask) & 7;
    if (xyzBits == 7)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (xyzBits == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is strictly less than V2 in each of x, y and z.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that fails the comparison.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (!(V1.vector4_f32[i] < V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcltq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmplt_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    return (_mm_movemask_ps(laneMask) & 7) == 7;
#endif
}

//------------------------------------------------------------------------------

// Returns true when V1 is less than or equal to V2 in each of x, y and z.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that fails the comparison.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (!(V1.vector4_f32[i] <= V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const uint32x4_t laneMask = vcleq_f32(V1, V2);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(laneMask)), vget_high_u8(vreinterpretq_u8_u32(laneMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR laneMask = _mm_cmple_ps(V1, V2);
    // Bits 0..2 of the move mask correspond to x, y and z.
    return (_mm_movemask_ps(laneMask) & 7) == 7;
#endif
}

//------------------------------------------------------------------------------

// Returns true when each of V.x, V.y, V.z lies inside the closed interval
// [-Bounds, +Bounds] of the matching Bounds component. The w component is ignored.
inline bool XM_CALLCONV XMVector3InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that falls outside its interval.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (!(V.vector4_f32[i] <= Bounds.vector4_f32[i] && V.vector4_f32[i] >= -Bounds.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Upper limit: V <= Bounds
    const uint32x4_t belowUpper = vcleq_f32(V, Bounds);
    // Lower limit: -Bounds <= V
    const float32x4_t lowerLimit = vnegq_f32(Bounds);
    const uint32x4_t aboveLower = vcleq_f32(lowerLimit, V);
    // A lane is inside only when both limits hold.
    const uint32x4_t inside = vandq_u32(belowUpper, aboveLower);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(inside)), vget_high_u8(vreinterpretq_u8_u32(inside)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // The low 24 bits carry x, y and z; the top byte (w) is masked away.
    return (packed & 0xFFFFFFU) == 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // Upper limit: V <= Bounds
    const XMVECTOR belowUpper = _mm_cmple_ps(V, Bounds);
    // Lower limit: Bounds * g_XMNegativeOne <= V
    const XMVECTOR lowerLimit = _mm_mul_ps(Bounds, g_XMNegativeOne);
    const XMVECTOR aboveLower = _mm_cmple_ps(lowerLimit, V);
    // A lane is inside only when both limits hold.
    const XMVECTOR inside = _mm_and_ps(belowUpper, aboveLower);
    // Only bits 0..2 (x, y, z) are tested; w is a don't-care.
    return (_mm_movemask_ps(inside) & 0x7) == 0x7;
#else
    // Fallback used when none of the three code paths above is selected.
    return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds));
#endif
}

//------------------------------------------------------------------------------

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Returns true when any of the x, y, z components of V is NaN.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3IsNaN(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // The first NaN component settles the answer.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            return true;
        }
    }
    return false;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // A NaN never compares equal to itself, so its lane mask is all zeros.
    const uint32x4_t selfEqual = vceqq_f32(V, V);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(selfEqual)), vget_high_u8(vreinterpretq_u8_u32(selfEqual)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // Any cleared byte among x, y, z means that lane holds a NaN.
    return (packed & 0xFFFFFFU) != 0xFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // A NaN always compares "not equal" to itself, which sets its lane mask.
    const XMVECTOR selfNotEqual = _mm_cmpneq_ps(V, V);
    // Any of bits 0..2 (x, y, z) set means a NaN was found.
    return (_mm_movemask_ps(selfNotEqual) & 7) != 0;
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Returns true when any of the x, y, z components of V is +infinity or -infinity.
// The w component is ignored.
inline bool XM_CALLCONV XMVector3IsInfinite(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // The first infinite component settles the answer.
    for (uint32_t i = 0; i < 3; ++i)
    {
        if (XMISINF(V.vector4_f32[i]))
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // AND with g_XMAbsMask so both signs of infinity look alike.
    const uint32x4_t magnitudeBits = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    // Lanes equal to g_XMInfinity produce an all-ones mask.
    const uint32x4_t infiniteMask = vceqq_f32(vreinterpretq_f32_u32(magnitudeBits), g_XMInfinity);
    // Two zips gather one byte of every lane mask into a single 32-bit word.
    const uint8x8x2_t byteZip = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(infiniteMask)), vget_high_u8(vreinterpretq_u8_u32(infiniteMask)));
    const uint16x4x2_t wordZip = vzip_u16(vreinterpret_u16_u8(byteZip.val[0]), vreinterpret_u16_u8(byteZip.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(wordZip.val[1]), 1);
    // Any set byte among x, y, z means that lane is infinite.
    return (packed & 0xFFFFFFU) != 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // AND with g_XMAbsMask so both signs of infinity look alike.
    const __m128 magnitude = _mm_and_ps(V, g_XMAbsMask);
    // Lanes equal to g_XMInfinity produce an all-ones mask.
    const __m128 infiniteMask = _mm_cmpeq_ps(magnitude, g_XMInfinity);
    // Any of bits 0..2 (x, y, z) set means an infinity was found.
    return (_mm_movemask_ps(infiniteMask) & 7) != 0;
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Computes the 3D dot product V1.x*V2.x + V1.y*V2.y + V1.z*V2.z and returns it
// replicated into every component of the result. The w inputs are not used.
inline XMVECTOR XM_CALLCONV XMVector3Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float dot = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
    // Splat the scalar across all four components.
    XMVECTORF32 splat = { { { dot, dot, dot, dot } } };
    return splat.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t products = vmulq_f32(V1, V2);
    const float32x2_t lowPair = vget_low_f32(products);
    const float32x2_t highPair = vget_high_f32(products);
    // x+y in both lanes
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    // z in both lanes
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t dot = vadd_f32(sumXY, zBoth);
    return vcombine_f32(dot, dot);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Single-instruction dot product with immediate mask 0x7f.
    return _mm_dp_ps(V1, V2, 0x7f);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR products = _mm_mul_ps(V1, V2);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(products, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    return _mm_hadd_ps(pairSums, pairSums);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR products = _mm_mul_ps(V1, V2);
    // Lane 0 = product y, lane 1 = product z
    const XMVECTOR yzLanes = XM_PERMUTE_PS(products, _MM_SHUFFLE(2, 1, 2, 1));
    // Lane 0 = x + y
    const XMVECTOR xPlusY = _mm_add_ss(products, yzLanes);
    // Bring product z into lane 0
    const XMVECTOR zLane = XM_PERMUTE_PS(yzLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x + y) + z
    const XMVECTOR total = _mm_add_ss(xPlusY, zLane);
    // Replicate lane 0 everywhere
    return XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}

//------------------------------------------------------------------------------

// Computes the 3D cross product of V1 and V2:
//   [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
// The w component of the result is zero.
inline XMVECTOR XM_CALLCONV XMVector3Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float crossX = (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]);
    const float crossY = (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]);
    const float crossZ = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
    XMVECTORF32 cross = { { { crossX, crossY, crossZ, 0.0f } } };
    return cross.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x2_t aXY = vget_low_f32(V1);
    const float32x2_t bXY = vget_low_f32(V2);

    const float32x2_t aYX = vrev64_f32(aXY);
    const float32x2_t bYX = vrev64_f32(bXY);

    const float32x2_t aZZ = vdup_lane_f32(vget_high_f32(V1), 0);
    const float32x2_t bZZ = vdup_lane_f32(vget_high_f32(V2), 0);

    // [ y1*z2, x1*z2, x1*y2, y1*x2 ]
    const XMVECTOR leftTerms = vmulq_f32(vcombine_f32(aYX, aXY), vcombine_f32(bZZ, bYX));
    // minus [ z1*y2, z1*x2, y1*x2, x1*y2 ]
    const XMVECTOR difference = vmlsq_f32(leftTerms, vcombine_f32(aZZ, aYX), vcombine_f32(bYX, bXY));
    // Lane 1 is the negative of the wanted y term; XOR with g_XMFlipY corrects it
    // (presumably a sign-bit mask on y - confirm against its definition).
    const uint32x4_t correctedBits = veorq_u32(vreinterpretq_u32_f32(difference), g_XMFlipY);
    // AND with g_XMMask3 to clear w.
    return vreinterpretq_f32_u32(vandq_u32(correctedBits, g_XMMask3));
#elif defined(_XM_SSE_INTRINSICS_)
    // V1 as y,z,x,w
    const XMVECTOR v1YZX = XM_PERMUTE_PS(V1, _MM_SHUFFLE(3, 0, 2, 1));
    // V2 as z,x,y,w
    const XMVECTOR v2ZXY = XM_PERMUTE_PS(V2, _MM_SHUFFLE(3, 1, 0, 2));
    // First half of each component: y1*z2, z1*x2, x1*y2
    const XMVECTOR leftTerms = _mm_mul_ps(v1YZX, v2ZXY);
    // Rotate V1 once more to z,x,y,w
    const XMVECTOR v1ZXY = XM_PERMUTE_PS(v1YZX, _MM_SHUFFLE(3, 0, 2, 1));
    // Rotate V2 once more to y,z,x,w
    const XMVECTOR v2YZX = XM_PERMUTE_PS(v2ZXY, _MM_SHUFFLE(3, 1, 0, 2));
    // Subtract the second half: leftTerms - (v1ZXY * v2YZX)
    const XMVECTOR cross = XM_FNMADD_PS(v1ZXY, v2YZX, leftTerms);
    // AND with g_XMMask3 to clear w.
    return _mm_and_ps(cross, g_XMMask3);
#endif
}

//------------------------------------------------------------------------------

// Returns the squared 3D length of V, i.e. the dot product of V with itself,
// in the form produced by XMVector3Dot.
inline XMVECTOR XM_CALLCONV XMVector3LengthSq(FXMVECTOR V) noexcept
{
    const XMVECTOR lengthSquared = XMVector3Dot(V, V);
    return lengthSquared;
}

//------------------------------------------------------------------------------

// Returns an estimate of 1 / length(V) for the 3D part of V, replicated into
// every component. Uses the hardware reciprocal-sqrt estimate where available.
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrtEst(XMVector3LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 3D dot product of V with itself
    const float32x4_t squares = vmulq_f32(V, V);
    const float32x2_t lowPair = vget_low_f32(squares);
    const float32x2_t highPair = vget_high_f32(squares);
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t lengthSq = vadd_f32(sumXY, zBoth);
    // Estimated reciprocal square root
    const float32x2_t invLength = vrsqrte_f32(lengthSq);
    return vcombine_f32(invLength, invLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    const XMVECTOR lengthSq = _mm_dp_ps(V, V, 0x7f);
    return _mm_rsqrt_ps(lengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(squares, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    const XMVECTOR lengthSq = _mm_hadd_ps(pairSums, pairSums);
    return _mm_rsqrt_ps(lengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of x, y and z
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // Lane 0 = z*z, lane 1 = y*y
    const XMVECTOR zyLanes = XM_PERMUTE_PS(squares, _MM_SHUFFLE(1, 2, 1, 2));
    // Lane 0 = x*x + z*z
    const XMVECTOR xPlusZ = _mm_add_ss(squares, zyLanes);
    // y*y in every lane
    const XMVECTOR yLane = XM_PERMUTE_PS(zyLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x*x + z*z) + y*y
    const XMVECTOR total = _mm_add_ss(xPlusZ, yLane);
    // Replicate the squared length everywhere
    const XMVECTOR lengthSq = XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
    // Estimated reciprocal square root
    return _mm_rsqrt_ps(lengthSq);
#endif
}

//------------------------------------------------------------------------------

// Returns 1 / length(V) for the 3D part of V, replicated into every component.
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrt(XMVector3LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 3D dot product of V with itself
    const float32x4_t squares = vmulq_f32(V, V);
    const float32x2_t lowPair = vget_low_f32(squares);
    const float32x2_t highPair = vget_high_f32(squares);
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t lengthSq = vadd_f32(sumXY, zBoth);
    // Reciprocal sqrt: initial estimate followed by two vrsqrts refinement steps
    const float32x2_t estimate0 = vrsqrte_f32(lengthSq);
    const float32x2_t product0 = vmul_f32(lengthSq, estimate0);
    const float32x2_t step0 = vrsqrts_f32(product0, estimate0);
    const float32x2_t estimate1 = vmul_f32(estimate0, step0);
    const float32x2_t product1 = vmul_f32(lengthSq, estimate1);
    const float32x2_t step1 = vrsqrts_f32(product1, estimate1);
    const float32x2_t invLength = vmul_f32(estimate1, step1);
    return vcombine_f32(invLength, invLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    const XMVECTOR lengthSq = _mm_dp_ps(V, V, 0x7f);
    const XMVECTOR length = _mm_sqrt_ps(lengthSq);
    return _mm_div_ps(g_XMOne, length);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(squares, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    const XMVECTOR lengthSq = _mm_hadd_ps(pairSums, pairSums);
    const XMVECTOR length = _mm_sqrt_ps(lengthSq);
    return _mm_div_ps(g_XMOne, length);
#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of x, y and z
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // Lane 0 = y*y, lane 1 = z*z
    const XMVECTOR yzLanes = XM_PERMUTE_PS(squares, _MM_SHUFFLE(2, 1, 2, 1));
    // Lane 0 = x*x + y*y
    const XMVECTOR xPlusY = _mm_add_ss(squares, yzLanes);
    // Bring z*z into lane 0
    const XMVECTOR zLane = XM_PERMUTE_PS(yzLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x*x + y*y) + z*z
    const XMVECTOR total = _mm_add_ss(xPlusY, zLane);
    // Replicate the squared length everywhere
    const XMVECTOR lengthSq = XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
    // Square root gives the length
    const XMVECTOR length = _mm_sqrt_ps(lengthSq);
    // Divide g_XMOne by the length to get the reciprocal
    return _mm_div_ps(g_XMOne, length);
#endif
}

//------------------------------------------------------------------------------

// Returns an estimate of the 3D length of V, replicated into every component.
inline XMVECTOR XM_CALLCONV XMVector3LengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrtEst(XMVector3LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 3D dot product of V with itself
    const float32x4_t squares = vmulq_f32(V, V);
    const float32x2_t lowPair = vget_low_f32(squares);
    const float32x2_t highPair = vget_high_f32(squares);
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t lengthSq = vadd_f32(sumXY, zBoth);
    const float32x2_t zero = vdup_n_f32(0);
    // Remember lanes whose squared length is exactly zero
    const uint32x2_t isZero = vceq_f32(lengthSq, zero);
    // sqrt(s) estimated as s * rsqrt_estimate(s)
    const float32x2_t invLength = vrsqrte_f32(lengthSq);
    const float32x2_t length = vmul_f32(lengthSq, invLength);
    // Zero-length input yields zero rather than the s * rsqrt product
    const float32x2_t guarded = vbsl_f32(isZero, zero, length);
    return vcombine_f32(guarded, guarded);
#elif defined(_XM_SSE4_INTRINSICS_)
    const XMVECTOR lengthSq = _mm_dp_ps(V, V, 0x7f);
    return _mm_sqrt_ps(lengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(squares, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    const XMVECTOR lengthSq = _mm_hadd_ps(pairSums, pairSums);
    return _mm_sqrt_ps(lengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of x, y and z
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // Lane 0 = z*z, lane 1 = y*y
    const XMVECTOR zyLanes = XM_PERMUTE_PS(squares, _MM_SHUFFLE(1, 2, 1, 2));
    // Lane 0 = x*x + z*z
    const XMVECTOR xPlusZ = _mm_add_ss(squares, zyLanes);
    // y*y in every lane
    const XMVECTOR yLane = XM_PERMUTE_PS(zyLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x*x + z*z) + y*y
    const XMVECTOR total = _mm_add_ss(xPlusZ, yLane);
    // Replicate the squared length everywhere
    const XMVECTOR lengthSq = XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
    // Square root gives the length
    return _mm_sqrt_ps(lengthSq);
#endif
}

//------------------------------------------------------------------------------

// Returns the 3D length of V, replicated into every component.
inline XMVECTOR XM_CALLCONV XMVector3Length(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrt(XMVector3LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 3D dot product of V with itself
    const float32x4_t squares = vmulq_f32(V, V);
    const float32x2_t lowPair = vget_low_f32(squares);
    const float32x2_t highPair = vget_high_f32(squares);
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t lengthSq = vadd_f32(sumXY, zBoth);
    const float32x2_t zero = vdup_n_f32(0);
    // Remember lanes whose squared length is exactly zero
    const uint32x2_t isZero = vceq_f32(lengthSq, zero);
    // Reciprocal sqrt: initial estimate followed by two vrsqrts refinement steps
    const float32x2_t estimate0 = vrsqrte_f32(lengthSq);
    const float32x2_t product0 = vmul_f32(lengthSq, estimate0);
    const float32x2_t step0 = vrsqrts_f32(product0, estimate0);
    const float32x2_t estimate1 = vmul_f32(estimate0, step0);
    const float32x2_t product1 = vmul_f32(lengthSq, estimate1);
    const float32x2_t step1 = vrsqrts_f32(product1, estimate1);
    const float32x2_t invLength = vmul_f32(estimate1, step1);
    // sqrt(s) = s * rsqrt(s)
    const float32x2_t length = vmul_f32(lengthSq, invLength);
    // Zero-length input yields zero rather than the s * rsqrt product
    const float32x2_t guarded = vbsl_f32(isZero, zero, length);
    return vcombine_f32(guarded, guarded);
#elif defined(_XM_SSE4_INTRINSICS_)
    const XMVECTOR lengthSq = _mm_dp_ps(V, V, 0x7f);
    return _mm_sqrt_ps(lengthSq);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(squares, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    const XMVECTOR lengthSq = _mm_hadd_ps(pairSums, pairSums);
    return _mm_sqrt_ps(lengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of x, y and z
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // Lane 0 = z*z, lane 1 = y*y
    const XMVECTOR zyLanes = XM_PERMUTE_PS(squares, _MM_SHUFFLE(1, 2, 1, 2));
    // Lane 0 = x*x + z*z
    const XMVECTOR xPlusZ = _mm_add_ss(squares, zyLanes);
    // y*y in every lane
    const XMVECTOR yLane = XM_PERMUTE_PS(zyLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x*x + z*z) + y*y
    const XMVECTOR total = _mm_add_ss(xPlusZ, yLane);
    // Replicate the squared length everywhere
    const XMVECTOR lengthSq = XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
    // Square root gives the length
    return _mm_sqrt_ps(lengthSq);
#endif
}

//------------------------------------------------------------------------------
// XMVector3NormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.

// Scales all four components of V by the reciprocal of its 3D length.
// No guard against zero or infinite length is applied on any path.
inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorMultiply(V, XMVector3ReciprocalLength(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 3D dot product of V with itself
    const float32x4_t squares = vmulq_f32(V, V);
    const float32x2_t lowPair = vget_low_f32(squares);
    const float32x2_t highPair = vget_high_f32(squares);
    const float32x2_t sumXY = vpadd_f32(lowPair, lowPair);
    const float32x2_t zBoth = vdup_lane_f32(highPair, 0);
    const float32x2_t lengthSq = vadd_f32(sumXY, zBoth);
    // Estimated reciprocal square root
    const float32x2_t invLength = vrsqrte_f32(lengthSq);
    // Scale the input by the reciprocal length
    return vmulq_f32(V, vcombine_f32(invLength, invLength));
#elif defined(_XM_SSE4_INTRINSICS_)
    const XMVECTOR lengthSq = _mm_dp_ps(V, V, 0x7f);
    const XMVECTOR invLength = _mm_rsqrt_ps(lengthSq);
    return _mm_mul_ps(invLength, V);
#elif defined(_XM_SSE3_INTRINSICS_)
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // AND with g_XMMask3 before the horizontal adds.
    const XMVECTOR xyzOnly = _mm_and_ps(squares, g_XMMask3);
    const XMVECTOR pairSums = _mm_hadd_ps(xyzOnly, xyzOnly);
    const XMVECTOR lengthSq = _mm_hadd_ps(pairSums, pairSums);
    const XMVECTOR invLength = _mm_rsqrt_ps(lengthSq);
    return _mm_mul_ps(invLength, V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Squares of x, y and z
    const XMVECTOR squares = _mm_mul_ps(V, V);
    // Lane 0 = y*y, lane 1 = z*z
    const XMVECTOR yzLanes = XM_PERMUTE_PS(squares, _MM_SHUFFLE(2, 1, 2, 1));
    // Lane 0 = x*x + y*y
    const XMVECTOR xPlusY = _mm_add_ss(squares, yzLanes);
    // Bring z*z into lane 0
    const XMVECTOR zLane = XM_PERMUTE_PS(yzLanes, _MM_SHUFFLE(1, 1, 1, 1));
    // Lane 0 = (x*x + y*y) + z*z
    const XMVECTOR total = _mm_add_ss(xPlusY, zLane);
    // Replicate the squared length everywhere
    const XMVECTOR lengthSq = XM_PERMUTE_PS(total, _MM_SHUFFLE(0, 0, 0, 0));
    // Estimated reciprocal square root
    const XMVECTOR invLength = _mm_rsqrt_ps(lengthSq);
    // Scale the input by the reciprocal length
    return _mm_mul_ps(invLength, V);
#endif
}

//------------------------------------------------------------------------------

// Scales V by the reciprocal of its 3D (x, y, z) length.
// In the SIMD paths a zero length produces a zero vector and an infinite
// squared length produces QNaN in every lane. The portable path leaves the
// scale factor untouched when the length is not greater than zero.
inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float fLength = XMVector3Length(V).vector4_f32[0];

    // Invert only strictly positive lengths so we never divide by zero;
    // otherwise the length itself is used as the scale factor.
    const float fScale = (fLength > 0) ? (1.0f / fLength) : fLength;

    XMVECTOR vNormalized;
    vNormalized.vector4_f32[0] = V.vector4_f32[0] * fScale;
    vNormalized.vector4_f32[1] = V.vector4_f32[1] * fScale;
    vNormalized.vector4_f32[2] = V.vector4_f32[2] * fScale;
    vNormalized.vector4_f32[3] = V.vector4_f32[3] * fScale;
    return vNormalized;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3(V, V) replicated into both lanes of a 64-bit vector
    float32x4_t vSquared = vmulq_f32(V, V);
    float32x2_t vLengthSq = vget_low_f32(vSquared);
    float32x2_t vHigh = vget_high_f32(vSquared);
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    vHigh = vdup_lane_f32(vHigh, 0);
    vLengthSq = vadd_f32(vLengthSq, vHigh);
    // Remember the special cases before the reciprocal sqrt is computed
    uint32x2_t vIsZero = vceq_f32(vLengthSq, vdup_n_f32(0));
    uint32x2_t vIsInf = vceq_f32(vLengthSq, vget_low_f32(g_XMInfinity));
    // Reciprocal sqrt: initial estimate refined by two Newton-Raphson steps
    float32x2_t vEst0 = vrsqrte_f32(vLengthSq);
    float32x2_t vProd0 = vmul_f32(vLengthSq, vEst0);
    float32x2_t vStep0 = vrsqrts_f32(vProd0, vEst0);
    float32x2_t vEst1 = vmul_f32(vEst0, vStep0);
    float32x2_t vProd1 = vmul_f32(vLengthSq, vEst1);
    float32x2_t vStep1 = vrsqrts_f32(vProd1, vEst1);
    float32x2_t vRcpLength = vmul_f32(vEst1, vStep1);
    // Scale the input
    XMVECTOR vNormalized = vmulq_f32(V, vcombine_f32(vRcpLength, vRcpLength));
    // Zero length -> zero vector
    vNormalized = vbslq_f32(vcombine_u32(vIsZero, vIsZero), vdupq_n_f32(0), vNormalized);
    // Infinite squared length -> QNaN
    return vbslq_f32(vcombine_u32(vIsInf, vIsInf), g_XMQNaN, vNormalized);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Mask 0x7f: dot product of x, y and z broadcast to all lanes
    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
    // Length used as the divisor
    XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    // All-ones where the length differs from zero (FP compare so -0.0 matches zero)
    XMVECTOR vNonZero = _mm_cmpneq_ps(_mm_setzero_ps(), vLength);
    // All-ones where the squared length differs from +infinity
    XMVECTOR vNotInf = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Normalize by division
    XMVECTOR vNormalized = _mm_div_ps(V, vLength);
    // Force lanes to zero when the length was zero
    vNormalized = _mm_and_ps(vNormalized, vNonZero);
    // Blend: QNaN where the squared length was infinite, the quotient elsewhere
    XMVECTOR vNaNPart = _mm_andnot_ps(vNotInf, g_XMQNaN);
    XMVECTOR vValuePart = _mm_and_ps(vNormalized, vNotInf);
    return _mm_or_ps(vNaNPart, vValuePart);
#elif defined(_XM_SSE3_INTRINSICS_)
    // Square, drop w, then two horizontal adds leave Dot3 in every lane
    XMVECTOR vLengthSq = _mm_and_ps(_mm_mul_ps(V, V), g_XMMask3);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    // Length used as the divisor
    XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    // All-ones where the length differs from zero (FP compare so -0.0 matches zero)
    XMVECTOR vNonZero = _mm_cmpneq_ps(_mm_setzero_ps(), vLength);
    // All-ones where the squared length differs from +infinity
    XMVECTOR vNotInf = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Normalize by division
    XMVECTOR vNormalized = _mm_div_ps(V, vLength);
    // Force lanes to zero when the length was zero
    vNormalized = _mm_and_ps(vNormalized, vNonZero);
    // Blend: QNaN where the squared length was infinite, the quotient elsewhere
    XMVECTOR vNaNPart = _mm_andnot_ps(vNotInf, g_XMQNaN);
    XMVECTOR vValuePart = _mm_and_ps(vNormalized, vNotInf);
    return _mm_or_ps(vNaNPart, vValuePart);
#elif defined(_XM_SSE_INTRINSICS_)
    // Dot3 via shuffles and scalar adds, then broadcast lane 0
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    XMVECTOR vShuffled = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 1, 2, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vShuffled);
    vShuffled = XM_PERMUTE_PS(vShuffled, _MM_SHUFFLE(1, 1, 1, 1));
    vLengthSq = _mm_add_ss(vLengthSq, vShuffled);
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(0, 0, 0, 0));
    // Length used as the divisor
    XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    // All-ones where the length differs from zero (FP compare so -0.0 matches zero)
    XMVECTOR vNonZero = _mm_cmpneq_ps(_mm_setzero_ps(), vLength);
    // All-ones where the squared length differs from +infinity
    XMVECTOR vNotInf = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Normalize by division
    XMVECTOR vNormalized = _mm_div_ps(V, vLength);
    // Force lanes to zero when the length was zero
    vNormalized = _mm_and_ps(vNormalized, vNonZero);
    // Blend: QNaN where the squared length was infinite, the quotient elsewhere
    XMVECTOR vNaNPart = _mm_andnot_ps(vNotInf, g_XMQNaN);
    XMVECTOR vValuePart = _mm_and_ps(vNormalized, vNotInf);
    return _mm_or_ps(vNaNPart, vValuePart);
#endif
}

//------------------------------------------------------------------------------

// Scalar convenience overload: replicates LengthMin and LengthMax across a
// vector and forwards to XMVector3ClampLengthV.
inline XMVECTOR XM_CALLCONV XMVector3ClampLength
(
    FXMVECTOR V,
    float    LengthMin,
    float    LengthMax
) noexcept
{
    return XMVector3ClampLengthV(V, XMVectorReplicate(LengthMin), XMVectorReplicate(LengthMax));
}

//------------------------------------------------------------------------------

// Rescales V so that its 3D length lies in [LengthMin, LengthMax].
// LengthMin and LengthMax must hold the same non-negative value in x, y and z,
// with LengthMax >= LengthMin (checked by the asserts below).
// When the length is already inside the range, V is returned unmodified.
inline XMVECTOR XM_CALLCONV XMVector3ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
) noexcept
{
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero()));
    assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero()));
    assert(XMVector3GreaterOrEqual(LengthMax, LengthMin));

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR SquaredLength = XMVector3LengthSq(V);
    const XMVECTOR InvLength = XMVectorReciprocalSqrt(SquaredLength);

    // Length = LengthSq * (1 / sqrt(LengthSq)); direction = V * (1 / sqrt(LengthSq))
    XMVECTOR CurrentLength = XMVectorMultiply(SquaredLength, InvLength);
    XMVECTOR Direction = XMVectorMultiply(V, InvLength);

    // The two masks are equal only when the squared length is neither zero nor
    // infinite; in the special cases fall back to the squared length itself.
    const XMVECTOR IsInfinite = XMVectorEqualInt(SquaredLength, g_XMInfinity.v);
    const XMVECTOR IsZero = XMVectorEqual(SquaredLength, Zero);
    const XMVECTOR IsRegular = XMVectorEqualInt(IsInfinite, IsZero);
    CurrentLength = XMVectorSelect(SquaredLength, CurrentLength, IsRegular);
    Direction = XMVectorSelect(SquaredLength, Direction, IsRegular);

    // Pick the clamped length: max when too long, min when too short
    const XMVECTOR TooLong = XMVectorGreater(CurrentLength, LengthMax);
    const XMVECTOR TooShort = XMVectorLess(CurrentLength, LengthMin);
    XMVECTOR TargetLength = XMVectorSelect(CurrentLength, LengthMax, TooLong);
    TargetLength = XMVectorSelect(TargetLength, LengthMin, TooShort);

    const XMVECTOR Rescaled = XMVectorMultiply(Direction, TargetLength);

    // Preserve the original vector (with no precision loss) if the length falls within the given range
    const XMVECTOR InRange = XMVectorEqualInt(TooLong, TooShort);
    return XMVectorSelect(Rescaled, V, InRange);
}

//------------------------------------------------------------------------------

// Reflects Incident about Normal using the 3D dot product:
//     Result = Incident - (2 * dot(Incident, Normal)) * Normal
inline XMVECTOR XM_CALLCONV XMVector3Reflect
(
    FXMVECTOR Incident,
    FXMVECTOR Normal
) noexcept
{
    const XMVECTOR IDotN = XMVector3Dot(Incident, Normal);

    // Doubling via addition: 2 * dot(Incident, Normal)
    const XMVECTOR TwiceIDotN = XMVectorAdd(IDotN, IDotN);

    // Incident - TwiceIDotN * Normal
    return XMVectorNegativeMultiplySubtract(TwiceIDotN, Normal, Incident);
}

//------------------------------------------------------------------------------

// Scalar convenience overload: replicates RefractionIndex across a vector and
// forwards to XMVector3RefractV.
inline XMVECTOR XM_CALLCONV XMVector3Refract
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    float    RefractionIndex
) noexcept
{
    return XMVector3RefractV(Incident, Normal, XMVectorReplicate(RefractionIndex));
}

//------------------------------------------------------------------------------

// Refracts Incident across Normal with a per-component refraction index.
// A zero vector is returned when the discriminant is <= 0 in all four lanes
// (total internal reflection).
inline XMVECTOR XM_CALLCONV XMVector3RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
) noexcept
{
    // With k = 1 - RefractionIndex^2 * (1 - dot(Incident, Normal)^2):
    //     Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + sqrt(k))

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR CosI = XMVector3Dot(Incident, Normal);

    // k = 1 - ((1 - CosI * CosI) * RefractionIndex) * RefractionIndex
    XMVECTOR K = XMVectorNegativeMultiplySubtract(CosI, CosI, g_XMOne.v);
    K = XMVectorMultiply(K, RefractionIndex);
    K = XMVectorNegativeMultiplySubtract(K, RefractionIndex, g_XMOne.v);

    // Total internal reflection
    if (XMVector4LessOrEqual(K, Zero))
    {
        return Zero;
    }

    // Factor applied to the normal: RefractionIndex * CosI + sqrt(k)
    const XMVECTOR NormalScale = XMVectorMultiplyAdd(RefractionIndex, CosI, XMVectorSqrt(K));

    // RefractionIndex * Incident - Normal * NormalScale
    const XMVECTOR ScaledIncident = XMVectorMultiply(RefractionIndex, Incident);
    return XMVectorNegativeMultiplySubtract(Normal, NormalScale, ScaledIncident);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR CosI = XMVector3Dot(Incident, Normal);

    // k = 1 - ((1 - CosI * CosI) * RefractionIndex) * RefractionIndex
    float32x4_t K = vmlsq_f32(g_XMOne, CosI, CosI);
    K = vmulq_f32(K, RefractionIndex);
    K = vmlsq_f32(g_XMOne, K, RefractionIndex);

    // Collapse the four 32-bit lane masks of (k <= 0) into a single 32-bit word
    uint32x4_t vNonPositive = vcleq_f32(K, g_XMZero);
    uint8x8x2_t vZipped8 = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vNonPositive)), vget_high_u8(vreinterpretq_u8_u32(vNonPositive)));
    uint16x4x2_t vZipped16 = vzip_u16(vreinterpret_u16_u8(vZipped8.val[0]), vreinterpret_u16_u8(vZipped8.val[1]));

    // Total internal reflection
    if (vget_lane_u32(vreinterpret_u32_u16(vZipped16.val[1]), 1) == 0xFFFFFFFFU)
    {
        return g_XMZero;
    }

    // sqrt(k) = k * rsqrt(k), with rsqrt refined by two Newton-Raphson steps
    float32x4_t vEst0 = vrsqrteq_f32(K);
    float32x4_t vProd0 = vmulq_f32(K, vEst0);
    float32x4_t vStep0 = vrsqrtsq_f32(vProd0, vEst0);
    float32x4_t vEst1 = vmulq_f32(vEst0, vStep0);
    float32x4_t vProd1 = vmulq_f32(K, vEst1);
    float32x4_t vStep1 = vrsqrtsq_f32(vProd1, vEst1);
    float32x4_t vRcpSqrt = vmulq_f32(vEst1, vStep1);
    K = vmulq_f32(K, vRcpSqrt);
    // Factor applied to the normal: sqrt(k) + RefractionIndex * CosI
    K = vmlaq_f32(K, RefractionIndex, CosI);
    // RefractionIndex * Incident - (factor * Normal)
    float32x4_t vRefracted = vmulq_f32(RefractionIndex, Incident);
    return vmlsq_f32(vRefracted, K, Normal);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR CosI = XMVector3Dot(Incident, Normal);

    // k = 1 - (1 - CosI * CosI) * (RefractionIndex * RefractionIndex)
    XMVECTOR K = XM_FNMADD_PS(CosI, CosI, g_XMOne);
    XMVECTOR IndexSq = _mm_mul_ps(RefractionIndex, RefractionIndex);
    K = XM_FNMADD_PS(K, IndexSq, g_XMOne);

    // Total internal reflection when k <= 0 in all four lanes
    XMVECTOR vNonPositive = _mm_cmple_ps(K, g_XMZero);
    if (_mm_movemask_ps(vNonPositive) == 0x0f)
    {
        return g_XMZero;
    }

    // Factor applied to the normal: RefractionIndex * CosI + sqrt(k)
    K = _mm_sqrt_ps(K);
    K = XM_FMADD_PS(RefractionIndex, CosI, K);
    // RefractionIndex * Incident - (factor * Normal)
    XMVECTOR vRefracted = _mm_mul_ps(RefractionIndex, Incident);
    return XM_FNMADD_PS(K, Normal, vRefracted);
#endif
}

//------------------------------------------------------------------------------

// Builds a vector from V by permuting its components with the sum or the
// difference of (y, z, y, y) and z. The variant is chosen per lane by
// comparing the sign tests of z and of (y, z, y, y).
inline XMVECTOR XM_CALLCONV XMVector3Orthogonal(FXMVECTOR V) noexcept
{
    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR SplatZ = XMVectorSplatZ(V);
    const XMVECTOR SwizzledYZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);

    // Candidate built from -V and (YZYY + Z)
    const XMVECTOR Negated = XMVectorSubtract(Zero, V);
    const XMVECTOR Sum = XMVectorAdd(SwizzledYZYY, SplatZ);
    const XMVECTOR FromSum = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(Negated, Sum);

    // Candidate built from V and (YZYY - Z)
    const XMVECTOR Difference = XMVectorSubtract(SwizzledYZYY, SplatZ);
    const XMVECTOR FromDifference = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, Difference);

    // Lanes where both "is negative" tests agree take the sum-based candidate
    const XMVECTOR ZNegative = XMVectorLess(SplatZ, Zero);
    const XMVECTOR YZYYNegative = XMVectorLess(SwizzledYZYY, Zero);
    const XMVECTOR SameSign = XMVectorEqualInt(ZNegative, YZYYNegative);

    return XMVectorSelect(FromDifference, FromSum, SameSign);
}

//------------------------------------------------------------------------------

// Estimated angle between N1 and N2: the 3D dot product is clamped to
// [-1, 1] and passed through XMVectorACosEst. No normalization is performed here.
inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormalsEst
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR CosAngle = XMVectorClamp(XMVector3Dot(N1, N2), g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACosEst(CosAngle);
}

//------------------------------------------------------------------------------

// Angle between N1 and N2: the 3D dot product is clamped to [-1, 1] and
// passed through XMVectorACos. No normalization is performed here.
inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenNormals
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR CosAngle = XMVectorClamp(XMVector3Dot(N1, N2), g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACos(CosAngle);
}

//------------------------------------------------------------------------------

// Angle between two arbitrary-length 3D vectors:
//     acos(clamp(dot(V1, V2) * (1 / |V1|) * (1 / |V2|), -1, 1))
inline XMVECTOR XM_CALLCONV XMVector3AngleBetweenVectors
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR RcpLength1 = XMVector3ReciprocalLength(V1);
    const XMVECTOR RcpLength2 = XMVector3ReciprocalLength(V2);
    const XMVECTOR RcpLengthProduct = XMVectorMultiply(RcpLength1, RcpLength2);

    const XMVECTOR Dot = XMVector3Dot(V1, V2);

    // Clamp so rounding error cannot push the cosine outside acos' domain
    XMVECTOR CosAngle = XMVectorMultiply(Dot, RcpLengthProduct);
    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);

    return XMVectorACos(CosAngle);
}

//------------------------------------------------------------------------------

// Distance from Point to the line through LinePoint1 and LinePoint2,
// returned as the result of XMVector3Length.
inline XMVECTOR XM_CALLCONV XMVector3LinePointDistance
(
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2,
    FXMVECTOR Point
) noexcept
{
    // Let ToPoint run from LinePoint1 to Point and Along from LinePoint1 to
    // LinePoint2. The projection of ToPoint onto the line is Along scaled by
    //
    //     Scale = dot(ToPoint, Along) / LengthSq(Along)
    //
    // and the distance is the length of what remains of ToPoint after the
    // projection is subtracted.

    const XMVECTOR ToPoint = XMVectorSubtract(Point, LinePoint1);
    const XMVECTOR Along = XMVectorSubtract(LinePoint2, LinePoint1);

    const XMVECTOR AlongLengthSq = XMVector3LengthSq(Along);
    const XMVECTOR Scale = XMVectorDivide(XMVector3Dot(ToPoint, Along), AlongLengthSq);

    const XMVECTOR Projection = XMVectorMultiply(Along, Scale);
    const XMVECTOR Perpendicular = XMVectorSubtract(ToPoint, Projection);

    return XMVector3Length(Perpendicular);
}

//------------------------------------------------------------------------------


// Splits V into a part along Normal and the remainder:
//     *pParallel      = Normal * dot(V, Normal)
//     *pPerpendicular = V - *pParallel
// Both output pointers must be non-null.
inline void XM_CALLCONV XMVector3ComponentsFromNormal
(
    XMVECTOR* pParallel,
    XMVECTOR* pPerpendicular,
    FXMVECTOR  V,
    FXMVECTOR  Normal
) noexcept
{
    assert(pParallel != nullptr);
    assert(pPerpendicular != nullptr);

    // Compute into a local first so both outputs derive from the same value
    const XMVECTOR AlongNormal = XMVectorMultiply(Normal, XMVector3Dot(V, Normal));

    *pParallel = AlongNormal;
    *pPerpendicular = XMVectorSubtract(V, AlongNormal);
}

//------------------------------------------------------------------------------
// Transform a vector using a rotation expressed as a unit quaternion
// Computes (conjugate(Q) * A) * Q with XMQuaternionMultiply, where A is
// selected from V and g_XMSelect1110.

inline XMVECTOR XM_CALLCONV XMVector3Rotate
(
    FXMVECTOR V,
    FXMVECTOR RotationQuaternion
) noexcept
{
    const XMVECTOR VectorAsQuaternion = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
    const XMVECTOR Conjugate = XMQuaternionConjugate(RotationQuaternion);
    const XMVECTOR Partial = XMQuaternionMultiply(Conjugate, VectorAsQuaternion);
    return XMQuaternionMultiply(Partial, RotationQuaternion);
}

//------------------------------------------------------------------------------
// Transform a vector using the inverse of a rotation expressed as a unit quaternion
// Computes (Q * A) * conjugate(Q) with XMQuaternionMultiply, where A is
// selected from V and g_XMSelect1110.

inline XMVECTOR XM_CALLCONV XMVector3InverseRotate
(
    FXMVECTOR V,
    FXMVECTOR RotationQuaternion
) noexcept
{
    const XMVECTOR VectorAsQuaternion = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
    const XMVECTOR Conjugate = XMQuaternionConjugate(RotationQuaternion);
    const XMVECTOR Partial = XMQuaternionMultiply(RotationQuaternion, VectorAsQuaternion);
    return XMQuaternionMultiply(Partial, Conjugate);
}

//------------------------------------------------------------------------------

// Transforms the 3D vector V by M, ignoring V.w and always adding row 3:
//     Result = V.x * M.r[0] + V.y * M.r[1] + V.z * M.r[2] + M.r[3]
// All four components of the result are returned (no divide by w).
inline XMVECTOR XM_CALLCONV XMVector3Transform
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Accumulate starting from the translation row: z term, then y, then x
    XMVECTOR Accum = XMVectorMultiplyAdd(XMVectorSplatZ(V), M.r[2], M.r[3]);
    Accum = XMVectorMultiplyAdd(XMVectorSplatY(V), M.r[1], Accum);
    return XMVectorMultiplyAdd(XMVectorSplatX(V), M.r[0], Accum);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x2_t vXY = vget_low_f32(V);
    // r[3] + r[0] * x
    XMVECTOR vAccum = vmlaq_lane_f32(M.r[3], M.r[0], vXY, 0);
    // ... + r[1] * y
    vAccum = vmlaq_lane_f32(vAccum, M.r[1], vXY, 1);
    // ... + r[2] * z
    return vmlaq_lane_f32(vAccum, M.r[2], vget_high_f32(V), 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // z * r[2] + r[3]
    XMVECTOR vAccum = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    vAccum = XM_FMADD_PS(vAccum, M.r[2], M.r[3]);
    // ... + y * r[1]
    XMVECTOR vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    vAccum = XM_FMADD_PS(vSplat, M.r[1], vAccum);
    // ... + x * r[0]
    vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
    return XM_FMADD_PS(vSplat, M.r[0], vAccum);
#endif
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
#endif


// Transforms a stream of XMFLOAT3 values by M and writes XMFLOAT4 results.
//
// For each input (x, y, z) the stored value is
//     x * M.r[0] + y * M.r[1] + z * M.r[2] + M.r[3]
// with all four components written (no divide by w).
//
// pOutputStream - destination; must not be null.
// OutputStride  - distance in bytes between consecutive outputs, >= sizeof(XMFLOAT4).
// pInputStream  - source; must not be null.
// InputStride   - distance in bytes between consecutive inputs, >= sizeof(XMFLOAT3).
// VectorCount   - number of vectors to transform.
// M             - transformation matrix.
//
// Returns pOutputStream.
inline XMFLOAT4* XM_CALLCONV XMVector3TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT3* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT3));
    

    assert(OutputStride >= sizeof(XMFLOAT4));
    

#if defined(_XM_NO_INTRINSICS_)

    // Byte pointers so the streams can be advanced by arbitrary byte strides
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Accumulate starting from the translation row: z term, then y, then x
        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Byte pointers so the streams can be advanced by arbitrary byte strides
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // i counts vectors already processed; 'four' is the number of whole groups of 4
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The 4-at-a-time path requires tightly packed input AND output
        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT4)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load: val[0] = four x's, val[1] = four y's, val[2] = four z's
                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT3) * 4;

                // Each vResultN holds output component N for all four vectors.
                // Trailing comments name the matrix elements: row0 = (A,B,C,D),
                // row1 = (E,F,G,H), row2 = (I,J,K,L), row3 = (M,N,O,P).
                float32x2_t r3 = vget_low_f32(row3);
                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N

                // Prefetches of upcoming input are interleaved with the arithmetic
                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(row3);
                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
                XMVECTOR vResult3 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O
                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                r = vget_low_f32(row2);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(row2);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O
                vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                float32x4x4_t R;
                R.val[0] = vResult0;
                R.val[1] = vResult1;
                R.val[2] = vResult2;
                R.val[3] = vResult3;

                // Interleaving store: turns the four component vectors back into
                // four consecutive (x, y, z, w) outputs
                vst4q_f32(reinterpret_cast<float*>(pOutputVector), R);
                pOutputVector += sizeof(XMFLOAT4) * 4;

                i += 4;
            }
        }
    }

    // Remaining vectors (or all of them when the fast path did not apply), one at a time
    for (; i < VectorCount; i++)
    {
        // Load x,y as a pair, then z into lane 0 of a zeroed pair, so exactly three floats are read
        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        float32x2_t zero = vdup_n_f32(0);
        float32x2_t VH = vld1_lane_f32(reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
        pInputVector += InputStride;

        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z

        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    // Byte pointers so the streams can be advanced by arbitrary byte strides
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // i counts vectors already processed; 'four' is the number of whole groups of 4
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The 4-at-a-time path only requires tightly packed input
        if (InputStride == sizeof(XMFLOAT3))
        {
            // Output counts as aligned when both the base address and the stride are multiples of 16
            if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
            {
                // Packed input, aligned output
                for (size_t j = 0; j < four; ++j)
                {
                    // Three unaligned 16-byte loads cover four packed XMFLOAT3 (48 bytes)
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    // NOTE: V2, V3 and V4 used below are not declared in this function,
                    // so they must be introduced by this macro (defined elsewhere).
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    // (z * row2 + row3) + y * row1 + x * row0
                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    // XM_STREAM_PS is defined elsewhere; presumably a streaming (non-temporal)
                    // store that needs the aligned destination checked above - TODO confirm
                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
            else
            {
                // Packed input, unaligned output
                // Same arithmetic as the aligned loop; only the store instruction differs
                for (size_t j = 0; j < four; ++j)
                {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    // NOTE: V2, V3 and V4 used below are not declared in this function,
                    // so they must be introduced by this macro (defined elsewhere).
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remaining vectors (or all of them when the input was not packed), one at a time.
    // The alignment test is repeated because the block above may have been skipped.
    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
    {
        // Aligned output
        for (; i < VectorCount; ++i)
        {
            XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
            pInputVector += InputStride;

            XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
            XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }
    else
    {
        // Unaligned output
        for (; i < VectorCount; ++i)
        {
            XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
            pInputVector += InputStride;

            XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

            XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
            XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
            XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
            vTemp = _mm_add_ps(vTemp, vTemp2);
            vTemp = _mm_add_ps(vTemp, vTemp3);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTemp);
            pOutputVector += OutputStride;
        }
    }

    // XM_SFENCE is defined elsewhere; presumably a store fence that orders the
    // XM_STREAM_PS stores issued above - TODO confirm
    XM_SFENCE();

    return pOutputStream;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

//------------------------------------------------------------------------------

// Transforms the 3D vector V by M, adding row 3, and then divides every
// component of the result by the resulting w:
//     T = V.x * M.r[0] + V.y * M.r[1] + V.z * M.r[2] + M.r[3]
//     Result = T / T.w
inline XMVECTOR XM_CALLCONV XMVector3TransformCoord
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
    // Accumulate starting from row 3: z term, then y, then x
    XMVECTOR Transformed = XMVectorMultiplyAdd(XMVectorSplatZ(V), M.r[2], M.r[3]);
    Transformed = XMVectorMultiplyAdd(XMVectorSplatY(V), M.r[1], Transformed);
    Transformed = XMVectorMultiplyAdd(XMVectorSplatX(V), M.r[0], Transformed);

    // Divide through by w
    return XMVectorDivide(Transformed, XMVectorSplatW(Transformed));
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
#endif


// Transforms a strided stream of 3D points by M and divides each result by its
// own w component.
//
// For every one of the VectorCount inputs, (x, y, z) is read as an XMFLOAT3 and
// x*M.r[0] + y*M.r[1] + z*M.r[2] + M.r[3] is evaluated (M.r[3] is added with an
// implicit weight of one). The result is divided by its w lane and the x, y, z
// lanes are written out as an XMFLOAT3.
//
//   pOutputStream - address of the first output element; asserted non-null.
//   OutputStride  - distance in bytes between consecutive outputs; asserted to
//                   be at least sizeof(XMFLOAT3).
//   pInputStream  - address of the first input element; asserted non-null.
//   InputStride   - distance in bytes between consecutive inputs; asserted to
//                   be at least sizeof(XMFLOAT3).
//   VectorCount   - number of vectors to transform.
//   M             - transformation matrix.
//
// Returns pOutputStream.
//
// The division by w is unconditional; no check for a zero w is made here.
inline XMFLOAT3* XM_CALLCONV XMVector3TransformCoordStream
(
    XMFLOAT3* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT3* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT3));
    

    assert(OutputStride >= sizeof(XMFLOAT3));
    

#if defined(_XM_NO_INTRINSICS_)

    // Byte pointers, because the strides are applied as byte offsets.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // One vector per iteration: transform, divide by w, store three floats.
    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMVECTOR W = XMVectorSplatW(Result);

        Result = XMVectorDivide(Result, W);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Byte pointers, because the strides are applied as byte offsets.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // i counts vectors already processed; the tail loop below resumes from it.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Four-at-a-time path, taken only when both streams are tightly packed.
        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load: V.val[0] holds four x values, V.val[1]
                // four y values and V.val[2] four z values.
                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT3) * 4;

                // Each accumulator starts from one broadcast lane of row3 and
                // then gathers x, y and z times the matching lane of row0..row2.
                // The XM_PREFETCH calls in between reference the input that
                // follows the block just loaded (macro defined elsewhere).
                float32x2_t r3 = vget_low_f32(row3);
                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N

                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(row3);
                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O
                W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                r = vget_low_f32(row2);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(row2);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O
                W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                // Divide x, y, z of all four results by their w values. The
                // 64-bit ARM targets use a vector divide; the other branch
                // multiplies by a refined reciprocal estimate instead.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
                V.val[0] = vdivq_f32(vResult0, W);
                V.val[1] = vdivq_f32(vResult1, W);
                V.val[2] = vdivq_f32(vResult2, W);
#else
                // 2 iterations of Newton-Raphson refinement of reciprocal
                float32x4_t Reciprocal = vrecpeq_f32(W);
                float32x4_t S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);
                S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);

                V.val[0] = vmulq_f32(vResult0, Reciprocal);
                V.val[1] = vmulq_f32(vResult1, Reciprocal);
                V.val[2] = vmulq_f32(vResult2, Reciprocal);
#endif

                // Interleaving store: writes the four results back as packed x, y, z triples.
                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT3) * 4;

                i += 4;
            }
        }
    }

    // Handles whatever the four-wide path left over. When that path did not
    // run, i is still 0 and every vector is processed here.
    for (; i < VectorCount; i++)
    {
        // VL = (x, y); VH = (z, 0). The third float is loaded on its own so
        // only three floats are read from the input element.
        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        float32x2_t zero = vdup_n_f32(0);
        float32x2_t VH = vld1_lane_f32(reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
        pInputVector += InputStride;

        XMVECTOR vResult = vmlaq_lane_f32(row3, row0, VL, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z

        // Broadcast lane 3 (w) of the transformed vector.
        VH = vget_high_f32(vResult);
        XMVECTOR W = vdupq_lane_f32(VH, 1);

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
        vResult = vdivq_f32(vResult, W);
#else
        // 2 iterations of Newton-Raphson refinement of reciprocal for W
        float32x4_t Reciprocal = vrecpeq_f32(W);
        float32x4_t S = vrecpsq_f32(Reciprocal, W);
        Reciprocal = vmulq_f32(S, Reciprocal);
        S = vrecpsq_f32(Reciprocal, W);
        Reciprocal = vmulq_f32(S, Reciprocal);

        vResult = vmulq_f32(vResult, Reciprocal);
#endif

        // Store x and y together, then z on its own, so exactly three floats are written.
        VL = vget_low_f32(vResult);
        vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
        vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    // Byte pointers, because the strides are applied as byte offsets.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // i counts vectors already processed; the tail loop below resumes from it.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The four-at-a-time paths require tightly packed input. They differ
        // only in how the results are written back.
        if (InputStride == sizeof(XMFLOAT3))
        {
            if (OutputStride == sizeof(XMFLOAT3))
            {
                // Low four address bits clear: the output starts on a 16-byte boundary.
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Three unaligned 16-byte loads at offsets 0, 16 and 32
                        // pick up the block of four packed input vectors.
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        // NOTE(review): V2, V3 and V4 are not declared in this
                        // function; XM3UNPACK3INTO4 (defined elsewhere) is
                        // expected to introduce them.
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        // NOTE(review): XM3PACK4INTO3 (defined elsewhere) is
                        // expected to leave the three packed 16-byte chunks in
                        // V1, vTemp and V3, which is what gets stored below.
                        XM3PACK4INTO3(vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Same computation as the aligned loop above; only the
                        // final stores differ (_mm_storeu_ps instead of XM_STREAM_PS).
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, row2, row3);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        XM3PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    // Input is still read four vectors at a time, but each
                    // result is stored individually and the output pointer
                    // advances by OutputStride after every store.
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, row2, row3);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

                    vTemp = _mm_div_ps(vTemp, W);
                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Handles whatever the four-wide paths left over. When none of them ran
    // (fewer than four vectors, or input not tightly packed), i is still 0 and
    // every vector is processed here.
    for (; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        XMVECTOR vTemp = XM_FMADD_PS(Z, row2, row3);
        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));

        vTemp = _mm_div_ps(vTemp, W);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    // NOTE(review): presumably a store fence paired with the XM_STREAM_PS
    // stores in the aligned path; it is issued on every path through this
    // branch. XM_SFENCE is defined elsewhere - confirm.
    XM_SFENCE();

    return pOutputStream;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

//------------------------------------------------------------------------------

// Transforms the x, y, z lanes of V by the first three rows of M.
//
// The result is x*M.r[0] + y*M.r[1] + z*M.r[2]; M.r[3] is never read, and the
// w lane of V does not take part in the computation.
inline XMVECTOR XM_CALLCONV XMVector3TransformNormal
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Broadcast each input component across a full vector.
    const XMVECTOR splatX = XMVectorSplatX(V);
    const XMVECTOR splatY = XMVectorSplatY(V);
    const XMVECTOR splatZ = XMVectorSplatZ(V);

    // z*r2 first, then accumulate y*r1 and x*r0.
    XMVECTOR accum = XMVectorMultiply(splatZ, M.r[2]);
    accum = XMVectorMultiplyAdd(splatY, M.r[1], accum);
    return XMVectorMultiplyAdd(splatX, M.r[0], accum);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lanes (x, y) and (z, w) of the input; the multiplies select lanes from these.
    const float32x2_t lanesXY = vget_low_f32(V);
    const float32x2_t lanesZW = vget_high_f32(V);

    XMVECTOR accum = vmulq_lane_f32(M.r[0], lanesXY, 0); // x * r0
    accum = vmlaq_lane_f32(accum, M.r[1], lanesXY, 1);   // + y * r1
    return vmlaq_lane_f32(accum, M.r[2], lanesZW, 0);    // + z * r2
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast each input component across a full vector.
    const XMVECTOR splatZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
    const XMVECTOR splatY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
    const XMVECTOR splatX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

    // z*r2 first, then accumulate y*r1 and x*r0.
    XMVECTOR accum = _mm_mul_ps(splatZ, M.r[2]);
    accum = XM_FMADD_PS(splatY, M.r[1], accum);
    return XM_FMADD_PS(splatX, M.r[0], accum);
#endif
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
#endif


inline XMFLOAT3* XM_CALLCONV XMVector3TransformNormalStream
(
    XMFLOAT3* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT3* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT3));
    

    assert(OutputStride >= sizeof(XMFLOAT3));
    

#if defined(_XM_NO_INTRINSICS_)

    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiply(Z, row2);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT3) * 4;

                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx

                XM_PREFETCH(pInputVector);

                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                r = vget_low_f32(row2);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(row2);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                V.val[0] = vResult0;
                V.val[1] = vResult1;
                V.val[2] = vResult2;

                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT3) * 4;

                i += 4;
            }
        }
    }

    for (; i < VectorCount; i++)
    {
        float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
        float32x2_t zero = vdup_n_f32(0);
        float32x2_t VH = vld1_lane_f32(reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
        pInputVector += InputStride;

        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z

        VL = vget_low_f32(vResult);
        vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
        vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        if (InputStride == sizeof(XMFLOAT3))
        {
            if (OutputStride == sizeof(XMFLOAT3))
            {
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V1 = _mm_add_ps(vTemp, vTemp3);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V2 = _mm_add_ps(vTemp, vTemp3);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V3 = _mm_add_ps(vTemp, vTemp3);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V4 = _mm_add_ps(vTemp, vTemp3);

                        // Pack and store the vectors
                        XM3PACK4INTO3(vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V1 = _mm_add_ps(vTemp, vTemp3);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V2 = _mm_add_ps(vTemp, vTemp3);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V3 = _mm_add_ps(vTemp, vTemp3);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = _mm_mul_ps(Z, row2);
                        vTemp2 = _mm_mul_ps(Y, row1);
                        vTemp3 = _mm_mul_ps(X, row0);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        V4 = _mm_add_ps(vTemp, vTemp3);

                        // Pack and store the vectors
                        XM3PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = _mm_mul_ps(Z, row2);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = _mm_mul_ps(Z, row2);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = _mm_mul_ps(Z, row2);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = _mm_mul_ps(Z, row2);
                    vTemp2 = _mm_mul_ps(Y, row1);
                    vTemp3 = _mm_mul_ps(X, row0);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    for (; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        XMVECTOR vTemp = _mm_mul_ps(Z, row2);
        XMVECTOR vTemp2 = _mm_mul_ps(Y, row1);
        XMVECTOR vTemp3 = _mm_mul_ps(X, row0);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    XM_SFENCE();

    return pOutputStream;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

//------------------------------------------------------------------------------

// Transforms V by World * View * Projection using XMVector3TransformCoord and
// then remaps the result with a per-component multiply-add built from the
// viewport rectangle and depth range:
//   x' = x *  (ViewportWidth  / 2)           + (ViewportX + ViewportWidth  / 2)
//   y' = y * -(ViewportHeight / 2)           + (ViewportY + ViewportHeight / 2)
//   z' = z *  (ViewportMaxZ - ViewportMinZ)  +  ViewportMinZ
// The w lane uses a scale of 0 and an offset of 0.
inline XMVECTOR XM_CALLCONV XMVector3Project
(
    FXMVECTOR V,
    float    ViewportX,
    float    ViewportY,
    float    ViewportWidth,
    float    ViewportHeight,
    float    ViewportMinZ,
    float    ViewportMaxZ,
    FXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World
) noexcept
{
    // Half extents are shared by the scale and the offset terms.
    const float HalfWidth = ViewportWidth * 0.5f;
    const float HalfHeight = ViewportHeight * 0.5f;

    // Y is negated in the scale term.
    const XMVECTOR ViewportScale = XMVectorSet(HalfWidth, -HalfHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
    const XMVECTOR ViewportOffset = XMVectorSet(ViewportX + HalfWidth, ViewportY + HalfHeight, ViewportMinZ, 0.0f);

    // Concatenate World -> View -> Projection into one matrix.
    const XMMATRIX WorldViewProjection = XMMatrixMultiply(XMMatrixMultiply(World, View), Projection);

    const XMVECTOR Transformed = XMVector3TransformCoord(V, WorldViewProjection);

    return XMVectorMultiplyAdd(Transformed, ViewportScale, ViewportOffset);
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
#endif


// Applies the same computation as a World * View * Projection transform followed
// by a divide by the resulting w and a viewport scale/offset to every XMFLOAT3 in
// an input stream, writing one XMFLOAT3 per input element to the output stream.
//
// Parameters:
//   pOutputStream  - destination of the first result; must not be null (asserted).
//   OutputStride   - distance in BYTES between consecutive outputs (the pointer is
//                    advanced as a uint8_t*); asserted >= sizeof(XMFLOAT3).
//   pInputStream   - first input vector; must not be null (asserted).
//   InputStride    - distance in BYTES between consecutive inputs; asserted
//                    >= sizeof(XMFLOAT3).
//   VectorCount    - number of vectors to process.
//   ViewportX/Y, ViewportWidth/Height, ViewportMinZ/MaxZ
//                  - used to build the per-component scale
//                    (Width/2, -Height/2, MaxZ-MinZ) and offset
//                    (X+Width/2, Y+Height/2, MinZ).
//   Projection, View, World
//                  - concatenated as World * View * Projection.
//
// Returns pOutputStream.
//
// Implementation notes:
//   * _XM_NO_INTRINSICS_: one vector per iteration via XMVector3TransformCoord.
//   * NEON: four vectors per iteration only when BOTH strides equal
//     sizeof(XMFLOAT3); everything else goes through a one-at-a-time loop.
//   * SSE: four vectors per iteration when InputStride == sizeof(XMFLOAT3), with
//     three store strategies (packed+16-byte-aligned output, packed unaligned
//     output, strided output); leftovers and non-packed input go through a
//     one-at-a-time loop.
inline XMFLOAT3* XM_CALLCONV XMVector3ProjectStream
(
    XMFLOAT3* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT3* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    float           ViewportX,
    float           ViewportY,
    float           ViewportWidth,
    float           ViewportHeight,
    float           ViewportMinZ,
    float           ViewportMaxZ,
    FXMMATRIX     Projection,
    CXMMATRIX     View,
    CXMMATRIX     World
) noexcept
{
    // Preconditions are checked with assert only.
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT3));
    

    assert(OutputStride >= sizeof(XMFLOAT3));
    

#if defined(_XM_NO_INTRINSICS_)

    const float HalfViewportWidth = ViewportWidth * 0.5f;
    const float HalfViewportHeight = ViewportHeight * 0.5f;

    // Per-component viewport scale and offset; Y scale is negated.
    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);

    // Combined World * View * Projection matrix, computed once for the whole stream.
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);

    // Byte pointers so the caller-supplied strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));

        XMVECTOR Result = XMVector3TransformCoord(V, Transform);
        Result = XMVectorMultiplyAdd(Result, Scale, Offset);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float HalfViewportWidth = ViewportWidth * 0.5f;
    const float HalfViewportHeight = ViewportHeight * 0.5f;

    // Combined World * View * Projection matrix, computed once for the whole stream.
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);

    // Byte pointers so the caller-supplied strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // i counts vectors already processed; 'four' is the number of complete groups of 4.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Batched path requires both streams to be tightly packed XMFLOAT3 arrays.
        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
        {
            // Each viewport scale/offset component broadcast to all four lanes.
            XMVECTOR ScaleX = vdupq_n_f32(HalfViewportWidth);
            XMVECTOR ScaleY = vdupq_n_f32(-HalfViewportHeight);
            XMVECTOR ScaleZ = vdupq_n_f32(ViewportMaxZ - ViewportMinZ);

            XMVECTOR OffsetX = vdupq_n_f32(ViewportX + HalfViewportWidth);
            XMVECTOR OffsetY = vdupq_n_f32(ViewportY + HalfViewportHeight);
            XMVECTOR OffsetZ = vdupq_n_f32(ViewportMinZ);

            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load: val[0] = four x's, val[1] = four y's, val[2] = four z's.
                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT3) * 4;

                // Accumulate each output component for all four vectors at once,
                // starting from the matrix's row 3 and adding x, y, z contributions.
                float32x2_t r3 = vget_low_f32(Transform.r[3]);
                float32x2_t r = vget_low_f32(Transform.r[0]);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Bx+N

                // Prefetches of upcoming input are interleaved with the arithmetic.
                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(Transform.r[3]);
                r = vget_high_f32(Transform.r[0]);
                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), V.val[0], r, 0); // Cx+O
                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), V.val[0], r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(Transform.r[1]);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(Transform.r[1]);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy+O
                W = vmlaq_lane_f32(W, V.val[1], r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                r = vget_low_f32(Transform.r[2]);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz+M
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(Transform.r[2]);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz+O
                W = vmlaq_lane_f32(W, V.val[2], r, 1); // Dx+Hy+Lz+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                // Divide x, y, z by w: a true divide on AArch64 targets, otherwise a
                // refined reciprocal estimate followed by a multiply.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
                vResult0 = vdivq_f32(vResult0, W);
                vResult1 = vdivq_f32(vResult1, W);
                vResult2 = vdivq_f32(vResult2, W);
#else
                // 2 iterations of Newton-Raphson refinement of reciprocal
                float32x4_t Reciprocal = vrecpeq_f32(W);
                float32x4_t S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);
                S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);

                vResult0 = vmulq_f32(vResult0, Reciprocal);
                vResult1 = vmulq_f32(vResult1, Reciprocal);
                vResult2 = vmulq_f32(vResult2, Reciprocal);
#endif

                // Viewport mapping: result * Scale + Offset, per component.
                V.val[0] = vmlaq_f32(OffsetX, vResult0, ScaleX);
                V.val[1] = vmlaq_f32(OffsetY, vResult1, ScaleY);
                V.val[2] = vmlaq_f32(OffsetZ, vResult2, ScaleZ);

                // Interleaving store writes the four results back as packed x,y,z triples.
                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT3) * 4;

                i += 4;
            }
        }
    }

    // Remaining vectors (all of them when the batched path was not taken).
    if (i < VectorCount)
    {
        XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
        XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);

        for (; i < VectorCount; i++)
        {
            // Load x,y as a pair and z into lane 0 of a zeroed pair, so only
            // three floats are read from the input element.
            float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
            float32x2_t zero = vdup_n_f32(0);
            float32x2_t VH = vld1_lane_f32(reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
            pInputVector += InputStride;

            // r[3] + x*r[0] + y*r[1] + z*r[2]
            XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X
            vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y
            vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z

            // Broadcast the w component (lane 1 of the high half) for the divide.
            VH = vget_high_f32(vResult);
            XMVECTOR W = vdupq_lane_f32(VH, 1);

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
            vResult = vdivq_f32(vResult, W);
#else
            // 2 iterations of Newton-Raphson refinement of reciprocal for W
            float32x4_t Reciprocal = vrecpeq_f32(W);
            float32x4_t S = vrecpsq_f32(Reciprocal, W);
            Reciprocal = vmulq_f32(S, Reciprocal);
            S = vrecpsq_f32(Reciprocal, W);
            Reciprocal = vmulq_f32(S, Reciprocal);

            vResult = vmulq_f32(vResult, Reciprocal);
#endif

            // Viewport mapping: result * Scale + Offset.
            vResult = vmlaq_f32(Offset, vResult, Scale);

            // Store x,y as a pair and z from lane 2, so only three floats are written.
            VL = vget_low_f32(vResult);
            vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
            vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
            pOutputVector += OutputStride;
        }
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    const float HalfViewportWidth = ViewportWidth * 0.5f;
    const float HalfViewportHeight = ViewportHeight * 0.5f;

    // Per-component viewport scale and offset; Y scale is negated.
    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);

    // Combined World * View * Projection matrix, computed once for the whole stream.
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);

    // Byte pointers so the caller-supplied strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // i counts vectors already processed; 'four' is the number of complete groups of 4.
    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // Batched paths require tightly packed input; otherwise everything is
        // handled by the one-at-a-time loop below.
        if (InputStride == sizeof(XMFLOAT3))
        {
            if (OutputStride == sizeof(XMFLOAT3))
            {
                // Low four address bits clear => output start is 16-byte aligned.
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Three unaligned 16-byte loads at byte offsets 0/16/32
                        // (assumes sizeof(XMFLOAT3) == 12 so these cover 4 vectors - TODO confirm).
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        // NOTE(review): V2, V3 and V4 used below are not declared in this
                        // function, so the macro (defined elsewhere) must introduce them.
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        // Broadcast each component, then form z*r[2] + r[3] + y*r[1] + x*r[0]
                        // (XM_FMADD_PS is defined elsewhere; presumably a*b+c).
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        // Divide by the broadcast w component, then apply the viewport mapping.
                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V1 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V2 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V3 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V4 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Pack and store the vectors
                        // NOTE(review): the macro presumably repacks V1..V4 into V1, vTemp, V3
                        // (the three registers stored below) - confirm against its definition.
                        XM3PACK4INTO3(vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    // Same arithmetic as the aligned case; only the stores differ.
                    for (size_t j = 0; j < four; ++j)
                    {
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V1 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 2
                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V2 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 3
                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V3 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Result 4
                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        vTemp = _mm_div_ps(vTemp, W);
                        V4 = XM_FMADD_PS(vTemp, Scale, Offset);

                        // Pack and store the vectors
                        XM3PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                // Inputs are still loaded four at a time, but each result is written
                // individually with XMStoreFloat3 and the output advances by OutputStride.
                for (size_t j = 0; j < four; ++j)
                {
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Result 1
                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);
                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);
                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);
                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);
                    vTemp = XM_FMADD_PS(vTemp, Scale, Offset);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remaining vectors: the 0-3 left over after a batched path, or all of them
    // when the input stream is not tightly packed.
    for (; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        pInputVector += InputStride;

        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
        vTemp = _mm_div_ps(vTemp, W);
        vTemp = XM_FMADD_PS(vTemp, Scale, Offset);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    // NOTE(review): presumably a store fence pairing with the XM_STREAM_PS stores in
    // the aligned path; both macros are defined elsewhere - confirm.
    XM_SFENCE();

    return pOutputStream;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

//------------------------------------------------------------------------------

// Remaps V with a per-component multiply-add derived from the viewport rectangle
// and depth range, then transforms the result by the inverse of
// World * View * Projection using XMVector3TransformCoord.
//
// The remap applied to V is V * S + B, where
//   S = 1 / (ViewportWidth / 2, -ViewportHeight / 2, ViewportMaxZ - ViewportMinZ, 1)
//   B = S * (-ViewportX, -ViewportY, -ViewportMinZ, 0) + (-1, 1, 0, 0)
inline XMVECTOR XM_CALLCONV XMVector3Unproject
(
    FXMVECTOR V,
    float     ViewportX,
    float     ViewportY,
    float     ViewportWidth,
    float     ViewportHeight,
    float     ViewportMinZ,
    float     ViewportMaxZ,
    FXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World
) noexcept
{
    // Constant term added to the scaled viewport origin.
    static const XMVECTORF32 BiasConstant = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };

    // Reciprocal of the viewport scale; the w lane is 1 so its reciprocal stays 1.
    const XMVECTOR InverseScale = XMVectorReciprocal(
        XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f));

    // InverseScale * (-origin) + BiasConstant
    const XMVECTOR NegatedOrigin = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
    const XMVECTOR Bias = XMVectorMultiplyAdd(InverseScale, NegatedOrigin, BiasConstant.v);

    // Invert the concatenated World -> View -> Projection matrix; the determinant
    // output is not requested (nullptr).
    const XMMATRIX WorldViewProjection = XMMatrixMultiply(XMMatrixMultiply(World, View), Projection);
    const XMMATRIX InverseTransform = XMMatrixInverse(nullptr, WorldViewProjection);

    const XMVECTOR Remapped = XMVectorMultiplyAdd(V, InverseScale, Bias);

    return XMVector3TransformCoord(Remapped, InverseTransform);
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015 26019, "PREfast noise: Esp:1307" )
#endif


// Unprojects a stream of XMFLOAT3 points.
//
// For each of the VectorCount input points the function:
//   1. remaps the point with a per-component scale and offset derived from
//      ViewportX/Y, ViewportWidth/Height and ViewportMinZ/MaxZ,
//   2. transforms the result by the inverse of (World * View * Projection),
//   3. divides by the resulting w component,
//   4. writes x, y, z to the output stream.
//
// Parameters:
//   pOutputStream / OutputStride - destination and the distance in bytes
//       between consecutive outputs (asserted to be >= sizeof(XMFLOAT3)).
//   pInputStream / InputStride   - source and the distance in bytes between
//       consecutive inputs (asserted to be >= sizeof(XMFLOAT3)).
//   VectorCount                  - number of points to process.
//   Viewport*                    - values used to build the scale/offset.
//   Projection, View, World      - matrices whose product is inverted.
//
// Returns pOutputStream.
//
// Three implementations are selected at compile time: a scalar one
// (_XM_NO_INTRINSICS_), an ARM NEON one and an SSE one. The SIMD variants
// process four points per iteration when the streams are tightly packed and
// fall back to one point per iteration for the remainder.
inline XMFLOAT3* XM_CALLCONV XMVector3UnprojectStream
(
    XMFLOAT3* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT3* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    float           ViewportX,
    float           ViewportY,
    float           ViewportWidth,
    float           ViewportHeight,
    float           ViewportMinZ,
    float           ViewportMaxZ,
    FXMMATRIX       Projection,
    CXMMATRIX       View,
    CXMMATRIX       World
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT3));
    

    assert(OutputStride >= sizeof(XMFLOAT3));
    

#if defined(_XM_NO_INTRINSICS_)

    // Constant added to the offset: (-1, +1, 0, 0).
    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };

    // Scale = 1 / (Width/2, -Height/2, MaxZ - MinZ, 1).
    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
    Scale = XMVectorReciprocal(Scale);

    // Offset = Scale * (-X, -Y, -MinZ, 0) + D.
    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);

    // Transform = inverse(World * View * Projection); the determinant output
    // of XMMatrixInverse is not requested (nullptr).
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);
    Transform = XMMatrixInverse(nullptr, Transform);

    // Walk both streams as raw bytes so that the strides can be applied directly.
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));

        // Remap: V * Scale + Offset.
        XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);

        Result = XMVector3TransformCoord(Result, Transform);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Transform = inverse(World * View * Projection).
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);
    Transform = XMMatrixInverse(nullptr, Transform);

    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    // Per-component scale, kept as scalars so they can be broadcast below.
    float sx = 1.f / (ViewportWidth * 0.5f);
    float sy = 1.f / (-ViewportHeight * 0.5f);
    float sz = 1.f / (ViewportMaxZ - ViewportMinZ);

    // Per-component offset.
    float ox = (-ViewportX * sx) - 1.f;
    float oy = (-ViewportY * sy) + 1.f;
    float oz = (-ViewportMinZ * sz);

    size_t i = 0;
    // Number of complete groups of four points.
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The four-at-a-time path is only taken when both streams are tightly packed.
        if ((InputStride == sizeof(XMFLOAT3)) && (OutputStride == sizeof(XMFLOAT3)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // V.val[0], V.val[1] and V.val[2] are used below as the x, y and z
                // components of four consecutive points.
                float32x4x3_t V = vld3q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT3) * 4;

                // VX = x * sx + ox for all four points.
                XMVECTOR ScaleX = vdupq_n_f32(sx);
                XMVECTOR OffsetX = vdupq_n_f32(ox);
                XMVECTOR VX = vmlaq_f32(OffsetX, ScaleX, V.val[0]);

                // Start each output component from the matching element of row 3
                // and accumulate the x contribution from row 0.
                float32x2_t r3 = vget_low_f32(Transform.r[3]);
                float32x2_t r = vget_low_f32(Transform.r[0]);
                XMVECTOR vResult0 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Ax+M
                XMVECTOR vResult1 = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Bx+N

                // Prefetches of upcoming input are interleaved with the arithmetic.
                XM_PREFETCH(pInputVector);

                r3 = vget_high_f32(Transform.r[3]);
                r = vget_high_f32(Transform.r[0]);
                XMVECTOR vResult2 = vmlaq_lane_f32(vdupq_lane_f32(r3, 0), VX, r, 0); // Cx+O
                XMVECTOR W = vmlaq_lane_f32(vdupq_lane_f32(r3, 1), VX, r, 1); // Dx+P

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                // VY = y * sy + oy; accumulate the y contribution from row 1.
                XMVECTOR ScaleY = vdupq_n_f32(sy);
                XMVECTOR OffsetY = vdupq_n_f32(oy);
                XMVECTOR VY = vmlaq_f32(OffsetY, ScaleY, V.val[1]);

                r = vget_low_f32(Transform.r[1]);
                vResult0 = vmlaq_lane_f32(vResult0, VY, r, 0); // Ax+Ey+M
                vResult1 = vmlaq_lane_f32(vResult1, VY, r, 1); // Bx+Fy+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(Transform.r[1]);
                vResult2 = vmlaq_lane_f32(vResult2, VY, r, 0); // Cx+Gy+O
                W = vmlaq_lane_f32(W, VY, r, 1); // Dx+Hy+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                // VZ = z * sz + oz; accumulate the z contribution from row 2.
                XMVECTOR ScaleZ = vdupq_n_f32(sz);
                XMVECTOR OffsetZ = vdupq_n_f32(oz);
                XMVECTOR VZ = vmlaq_f32(OffsetZ, ScaleZ, V.val[2]);

                r = vget_low_f32(Transform.r[2]);
                vResult0 = vmlaq_lane_f32(vResult0, VZ, r, 0); // Ax+Ey+Iz+M
                vResult1 = vmlaq_lane_f32(vResult1, VZ, r, 1); // Bx+Fy+Jz+N

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(Transform.r[2]);
                vResult2 = vmlaq_lane_f32(vResult2, VZ, r, 0); // Cx+Gy+Kz+O
                W = vmlaq_lane_f32(W, VZ, r, 1); // Dx+Hy+Lz+P

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                // Divide x, y, z by w: a true divide on the 64-bit ARM targets,
                // a refined reciprocal estimate followed by a multiply otherwise.
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
                V.val[0] = vdivq_f32(vResult0, W);
                V.val[1] = vdivq_f32(vResult1, W);
                V.val[2] = vdivq_f32(vResult2, W);
#else
                // 2 iterations of Newton-Raphson refinement of reciprocal
                float32x4_t Reciprocal = vrecpeq_f32(W);
                float32x4_t S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);
                S = vrecpsq_f32(Reciprocal, W);
                Reciprocal = vmulq_f32(S, Reciprocal);

                V.val[0] = vmulq_f32(vResult0, Reciprocal);
                V.val[1] = vmulq_f32(vResult1, Reciprocal);
                V.val[2] = vmulq_f32(vResult2, Reciprocal);
#endif

                // Write the four results back as packed x, y, z triples.
                vst3q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT3) * 4;

                i += 4;
            }
        }
    }

    // Remaining points (all of them when the packed path above was not taken).
    if (i < VectorCount)
    {
        // Build (sx, sy) / (sz, 0) and (ox, oy) / (oz, 0) pairs from the raw bit
        // patterns of the scalar floats.
        float32x2_t ScaleL = vcreate_f32(
            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sx))
            | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sy)) << 32));
        float32x2_t ScaleH = vcreate_f32(static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&sz)));

        float32x2_t OffsetL = vcreate_f32(
            static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&ox))
            | (static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&oy)) << 32));
        float32x2_t OffsetH = vcreate_f32(static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(&oz)));

        for (; i < VectorCount; i++)
        {
            // VL = (x, y); VH = (z, 0). Only three floats are read from the input.
            float32x2_t VL = vld1_f32(reinterpret_cast<const float*>(pInputVector));
            float32x2_t zero = vdup_n_f32(0);
            float32x2_t VH = vld1_lane_f32(reinterpret_cast<const float*>(pInputVector) + 2, zero, 0);
            pInputVector += InputStride;

            // Remap: V * Scale + Offset.
            VL = vmla_f32(OffsetL, VL, ScaleL);
            VH = vmla_f32(OffsetH, VH, ScaleH);

            // vResult = r[3] + x * r[0] + y * r[1] + z * r[2].
            XMVECTOR vResult = vmlaq_lane_f32(Transform.r[3], Transform.r[0], VL, 0); // X
            vResult = vmlaq_lane_f32(vResult, Transform.r[1], VL, 1); // Y
            vResult = vmlaq_lane_f32(vResult, Transform.r[2], VH, 0); // Z

            // Broadcast the w component (lane 1 of the high half).
            VH = vget_high_f32(vResult);
            XMVECTOR W = vdupq_lane_f32(VH, 1);

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
            vResult = vdivq_f32(vResult, W);
#else
            // 2 iterations of Newton-Raphson refinement of reciprocal for W
            float32x4_t Reciprocal = vrecpeq_f32(W);
            float32x4_t S = vrecpsq_f32(Reciprocal, W);
            Reciprocal = vmulq_f32(S, Reciprocal);
            S = vrecpsq_f32(Reciprocal, W);
            Reciprocal = vmulq_f32(S, Reciprocal);

            vResult = vmulq_f32(vResult, Reciprocal);
#endif

            // Store x, y as a pair and z separately so only three floats are written.
            VL = vget_low_f32(vResult);
            vst1_f32(reinterpret_cast<float*>(pOutputVector), VL);
            vst1q_lane_f32(reinterpret_cast<float*>(pOutputVector) + 2, vResult, 2);
            pOutputVector += OutputStride;
        }
    }

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    // Constant added to the offset: (-1, +1, 0, 0).
    static const XMVECTORF32 D = { { { -1.0f, 1.0f, 0.0f, 0.0f } } };

    // Scale = 1 / (Width/2, -Height/2, MaxZ - MinZ, 1).
    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
    Scale = XMVectorReciprocal(Scale);

    // Offset = Scale * (-X, -Y, -MinZ, 0) + D.
    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
    Offset = _mm_mul_ps(Scale, Offset);
    Offset = _mm_add_ps(Offset, D);

    // Transform = inverse(World * View * Projection).
    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);
    Transform = XMMatrixInverse(nullptr, Transform);

    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    size_t i = 0;
    // Number of complete groups of four points.
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The four-at-a-time paths require tightly packed input; the output may be
        // packed (aligned or unaligned) or strided.
        if (InputStride == sizeof(XMFLOAT3))
        {
            if (OutputStride == sizeof(XMFLOAT3))
            {
                // Select the store flavour based on 16-byte alignment of the output.
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Load 48 bytes = four packed XMFLOAT3 values.
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        // NOTE(review): V2, V3 and V4 used below are not declared in this
                        // function; they are presumably introduced by this macro.
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        V1 = XM_FMADD_PS(V1, Scale, Offset);

                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        // vTemp = r[3] + Z * r[2] + Y * r[1] + X * r[0]
                        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        // Divide by the broadcast w component.
                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        V2 = XM_FMADD_PS(V2, Scale, Offset);

                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        V3 = XM_FMADD_PS(V3, Scale, Offset);

                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        V4 = XM_FMADD_PS(V4, Scale, Offset);

                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        // NOTE(review): the macro presumably repacks V1..V4 into the three
                        // registers V1, vTemp and V3 that are written out below - confirm
                        // against the macro definition.
                        XM3PACK4INTO3(vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), V1);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
                else
                {
                    // Packed input, unaligned & packed output
                    for (size_t j = 0; j < four; ++j)
                    {
                        // Load 48 bytes = four packed XMFLOAT3 values.
                        __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                        __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                        pInputVector += sizeof(XMFLOAT3) * 4;

                        // Unpack the 4 vectors (.w components are junk)
                        XM3UNPACK3INTO4(V1, L2, L3);

                        // Result 1
                        V1 = XM_FMADD_PS(V1, Scale, Offset);

                        XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                        XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                        XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V1 = _mm_div_ps(vTemp, W);

                        // Result 2
                        V2 = XM_FMADD_PS(V2, Scale, Offset);

                        Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V2 = _mm_div_ps(vTemp, W);

                        // Result 3
                        V3 = XM_FMADD_PS(V3, Scale, Offset);

                        Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V3 = _mm_div_ps(vTemp, W);

                        // Result 4
                        V4 = XM_FMADD_PS(V4, Scale, Offset);

                        Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                        Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                        X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                        vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                        vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                        vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                        vTemp = _mm_add_ps(vTemp, vTemp2);
                        vTemp = _mm_add_ps(vTemp, vTemp3);

                        W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                        V4 = _mm_div_ps(vTemp, W);

                        // Pack and store the vectors
                        XM3PACK4INTO3(vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), V1);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 16), vTemp);
                        _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector + 32), V3);
                        pOutputVector += sizeof(XMFLOAT3) * 4;
                        i += 4;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < four; ++j)
                {
                    // Load 48 bytes = four packed XMFLOAT3 values.
                    __m128 V1 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    __m128 L2 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 16));
                    __m128 L3 = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector + 32));
                    pInputVector += sizeof(XMFLOAT3) * 4;

                    // Unpack the 4 vectors (.w components are junk)
                    XM3UNPACK3INTO4(V1, L2, L3);

                    // Each result is stored individually because the output is strided.

                    // Result 1
                    V1 = XM_FMADD_PS(V1, Scale, Offset);

                    XMVECTOR Z = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 2, 2, 2));
                    XMVECTOR Y = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 1, 1));
                    XMVECTOR X = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 0));

                    XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 2
                    V2 = XM_FMADD_PS(V2, Scale, Offset);

                    Z = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V2, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 3
                    V3 = XM_FMADD_PS(V3, Scale, Offset);

                    Z = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    // Result 4
                    V4 = XM_FMADD_PS(V4, Scale, Offset);

                    Z = XM_PERMUTE_PS(V4, _MM_SHUFFLE(2, 2, 2, 2));
                    Y = XM_PERMUTE_PS(V4, _MM_SHUFFLE(1, 1, 1, 1));
                    X = XM_PERMUTE_PS(V4, _MM_SHUFFLE(0, 0, 0, 0));

                    vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
                    vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
                    vTemp3 = _mm_mul_ps(X, Transform.r[0]);
                    vTemp = _mm_add_ps(vTemp, vTemp2);
                    vTemp = _mm_add_ps(vTemp, vTemp3);

                    W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
                    vTemp = _mm_div_ps(vTemp, W);

                    XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
                    pOutputVector += OutputStride;

                    i += 4;
                }
            }
        }
    }

    // Remaining points (all of them when the input is not tightly packed),
    // processed one at a time with the caller-supplied strides.
    for (; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pInputVector));
        pInputVector += InputStride;

        // Remap: V * Scale + Offset.
        V = _mm_mul_ps(V, Scale);
        V = _mm_add_ps(V, Offset);

        XMVECTOR Z = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
        XMVECTOR Y = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
        XMVECTOR X = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));

        // vTemp = r[3] + Z * r[2] + Y * r[1] + X * r[0]
        XMVECTOR vTemp = XM_FMADD_PS(Z, Transform.r[2], Transform.r[3]);
        XMVECTOR vTemp2 = _mm_mul_ps(Y, Transform.r[1]);
        XMVECTOR vTemp3 = _mm_mul_ps(X, Transform.r[0]);
        vTemp = _mm_add_ps(vTemp, vTemp2);
        vTemp = _mm_add_ps(vTemp, vTemp3);

        // Divide by the broadcast w component.
        XMVECTOR W = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(3, 3, 3, 3));
        vTemp = _mm_div_ps(vTemp, W);

        XMStoreFloat3(reinterpret_cast<XMFLOAT3*>(pOutputVector), vTemp);
        pOutputVector += OutputStride;
    }

    // NOTE(review): presumably a store fence pairing with the XM_STREAM_PS writes
    // in the aligned path - confirm against the macro definition.
    XM_SFENCE();

    return pOutputStream;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

/****************************************************************************
 *
 * 4D Vector
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Comparison operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------

// Returns true only when every one of the four float components of V1
// compares equal to the matching component of V2.
inline bool XM_CALLCONV XMVector4Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Bail out on the first component that does not compare equal.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (!(V1.vector4_f32[lane] == V2.vector4_f32[lane]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    return packed == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // One sign bit per lane; 0x0f means all four lanes compared equal.
    const int laneBits = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2));
    return laneBits == 0x0f;
#else
    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
#endif
}

//------------------------------------------------------------------------------

// Compares the four float components of V1 and V2 and returns a comparison
// record: XM_CRMASK_CR6TRUE when all four are equal, XM_CRMASK_CR6FALSE when
// none are equal, and 0 for a mixed result.
inline uint32_t XM_CALLCONV XMVector4EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Count the components that compare equal; only the two extremes map to
    // a non-zero record.
    uint32_t equalLanes = 0;
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_f32[lane] == V2.vector4_f32[lane])
        {
            ++equalLanes;
        }
    }

    if (equalLanes == 4)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (equalLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);

    if (packed == 0xFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (packed == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // One bit per lane, set where the lanes compared equal.
    const int laneBits = _mm_movemask_ps(_mm_cmpeq_ps(V1, V2));
    switch (laneBits)
    {
    case 0xf:   // All equal
        return XM_CRMASK_CR6TRUE;
    case 0:     // All not equal
        return XM_CRMASK_CR6FALSE;
    default:    // Mixed
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Returns true only when all four components of V1 and V2, compared as
// 32-bit unsigned integers, are equal.
inline bool XM_CALLCONV XMVector4EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Bail out on the first component whose bits differ.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_u32[lane] != V2.vector4_u32[lane])
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    return packed == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare, then read one sign bit per lane.
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    const int laneBits = _mm_movemask_ps(_mm_castsi128_ps(laneMask));
    return laneBits == 0xf;
#else
    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------

// Compares the four components of V1 and V2 as 32-bit unsigned integers and
// returns a comparison record: XM_CRMASK_CR6TRUE when all four are equal,
// XM_CRMASK_CR6FALSE when none are equal, and 0 for a mixed result.
inline uint32_t XM_CALLCONV XMVector4EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Count matching components; only the two extremes map to a non-zero record.
    uint32_t equalLanes = 0;
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_u32[lane] == V2.vector4_u32[lane])
        {
            ++equalLanes;
        }
    }

    if (equalLanes == 4)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (equalLanes == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);

    if (packed == 0xFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (packed == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare, then read one sign bit per lane.
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    const int laneBits = _mm_movemask_ps(_mm_castsi128_ps(laneMask));
    switch (laneBits)
    {
    case 0xf:   // All equal
        return XM_CRMASK_CR6TRUE;
    case 0:     // All not equal
        return XM_CRMASK_CR6FALSE;
    default:    // Mixed
        return 0;
    }
#endif
}

// Returns true when, for each of the four components, the absolute
// difference between V1 and V2 is less than or equal to the matching
// component of Epsilon.
inline bool XM_CALLCONV XMVector4NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Check each component against its own tolerance.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        const float delta = fabsf(V1.vector4_f32[lane] - V2.vector4_f32[lane]);
        if (!(delta <= Epsilon.vector4_f32[lane]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t difference = vsubq_f32(V1, V2);
#if defined(_MSC_VER) && !defined(__clang__) && !defined(_ARM64_DISTINCT_NEON_TYPES)
    const uint32x4_t laneMask = vacleq_f32(difference, Epsilon);
#else
    const uint32x4_t laneMask = vcleq_f32(vabsq_f32(difference), Epsilon);
#endif
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    return packed == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // |V1 - V2| computed as max(0 - d, d).
    const XMVECTOR difference = _mm_sub_ps(V1, V2);
    const XMVECTOR negated = _mm_sub_ps(_mm_setzero_ps(), difference);
    const XMVECTOR magnitude = _mm_max_ps(negated, difference);
    // One bit per lane that is within tolerance.
    const int laneBits = _mm_movemask_ps(_mm_cmple_ps(magnitude, Epsilon));
    return laneBits == 0xf;
#endif
}

//------------------------------------------------------------------------------

// Returns true when at least one of the four float components of V1
// differs from the matching component of V2.
inline bool XM_CALLCONV XMVector4NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // A single differing component is enough.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_f32[lane] != V2.vector4_f32[lane])
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_f32(V1, V2);
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    // Anything short of "all lanes equal" counts as not equal.
    return packed != 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // One bit per lane that compared not-equal; any set bit is a mismatch.
    const int laneBits = _mm_movemask_ps(_mm_cmpneq_ps(V1, V2));
    return laneBits != 0;
#else
    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
#endif
}

//------------------------------------------------------------------------------

// Returns true when at least one of the four components of V1, compared as
// a 32-bit unsigned integer, differs from the matching component of V2.
inline bool XM_CALLCONV XMVector4NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // A single differing component is enough.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_u32[lane] != V2.vector4_u32[lane])
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where equal, all zeros otherwise.
    const uint32x4_t laneMask = vceqq_u32(vreinterpretq_u32_f32(V1), vreinterpretq_u32_f32(V2));
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    // Anything short of "all lanes equal" counts as not equal.
    return packed != 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer equality per lane; fewer than four set bits means a mismatch.
    const __m128i laneMask = _mm_cmpeq_epi32(_mm_castps_si128(V1), _mm_castps_si128(V2));
    const int laneBits = _mm_movemask_ps(_mm_castsi128_ps(laneMask));
    return laneBits != 0xF;
#else
    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------

// Returns true only when every one of the four float components of V1 is
// strictly greater than the matching component of V2.
inline bool XM_CALLCONV XMVector4Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Bail out on the first component that is not strictly greater.
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (!(V1.vector4_f32[lane] > V2.vector4_f32[lane]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 > V2, all zeros otherwise.
    const uint32x4_t laneMask = vcgtq_f32(V1, V2);
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);
    return packed == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // One bit per lane where V1 > V2; 0x0f means all four lanes passed.
    const int laneBits = _mm_movemask_ps(_mm_cmpgt_ps(V1, V2));
    return laneBits == 0x0f;
#else
    return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
#endif
}

//------------------------------------------------------------------------------

// Compares the four float components of V1 and V2 with "greater than" and
// returns a comparison record: XM_CRMASK_CR6TRUE when all four components
// of V1 are greater, XM_CRMASK_CR6FALSE when none are, and 0 otherwise.
inline uint32_t XM_CALLCONV XMVector4GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Tally components that are strictly greater and components that are
    // less-or-equal separately; each test is evaluated exactly as written.
    uint32_t greaterLanes = 0;
    uint32_t lessOrEqualLanes = 0;
    for (size_t lane = 0; lane < 4; ++lane)
    {
        if (V1.vector4_f32[lane] > V2.vector4_f32[lane])
        {
            ++greaterLanes;
        }
        else if (V1.vector4_f32[lane] <= V2.vector4_f32[lane])
        {
            ++lessOrEqualLanes;
        }
    }

    if (greaterLanes == 4)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (lessOrEqualLanes == 4)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 > V2, all zeros otherwise.
    const uint32x4_t laneMask = vcgtq_f32(V1, V2);
    const uint8x16_t maskBytes = vreinterpretq_u8_u32(laneMask);
    // Two zips gather one byte of each lane's mask into a single 32-bit value.
    const uint8x8x2_t zipped8 = vzip_u8(vget_low_u8(maskBytes), vget_high_u8(maskBytes));
    const uint16x4x2_t zipped16 = vzip_u16(vreinterpret_u16_u8(zipped8.val[0]), vreinterpret_u16_u8(zipped8.val[1]));
    const uint32_t packed = vget_lane_u32(vreinterpret_u32_u16(zipped16.val[1]), 1);

    if (packed == 0xFFFFFFFFU)
    {
        return XM_CRMASK_CR6TRUE;
    }
    if (packed == 0)
    {
        return XM_CRMASK_CR6FALSE;
    }
    return 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // One bit per lane where V1 > V2.
    const int laneBits = _mm_movemask_ps(_mm_cmpgt_ps(V1, V2));
    switch (laneBits)
    {
    case 0xf:   // All greater
        return XM_CRMASK_CR6TRUE;
    case 0:     // None greater
        return XM_CRMASK_CR6FALSE;
    default:    // Mixed
        return 0;
    }
#endif
}

//------------------------------------------------------------------------------

// Tests whether every component of V1 is greater than or equal to the matching
// component of V2. Returns true only when all four comparisons succeed.
inline bool XM_CALLCONV XMVector4GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first lane that fails the test.
    for (size_t i = 0; i < 4; ++i)
    {
        if (!(V1.vector4_f32[i] >= V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 >= V2, zero otherwise.
    const uint32x4_t vMask = vcgeq_f32(V1, V2);
    // Two zips gather one byte of each lane's mask into a single 32-bit word.
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vMask);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return uPacked == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vMask = _mm_cmpge_ps(V1, V2);
    // movemask yields one bit per lane; 0xF means all four lanes passed.
    const int iLaneBits = _mm_movemask_ps(vMask);
    return iLaneBits == 0x0f;
#else
    const uint32_t uRecord = XMVector4GreaterOrEqualR(V1, V2);
    return XMComparisonAllTrue(uRecord);
#endif
}

//------------------------------------------------------------------------------

// Compares V1 and V2 component-wise with '>=' and summarizes the four results
// as a comparison record:
//   XM_CRMASK_CR6TRUE  - V1 is greater than or equal to V2 in every component
//   XM_CRMASK_CR6FALSE - V1 is greater than or equal to V2 in no component
//   0                  - the per-component results are mixed
// A lane whose comparison is false for any reason (including a NaN operand)
// counts as "not greater-or-equal" on every code path.
inline uint32_t XM_CALLCONV XMVector4GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Count the lanes that pass the '>=' test. Deciding "all false" from a
    // pass count of zero (instead of testing '<' on every lane) mirrors the
    // SIMD paths below, which test the '>=' mask against zero. The two forms
    // only differ when an operand is NaN, where both '>=' and '<' are false.
    uint32_t uPassed = 0;
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (V1.vector4_f32[i] >= V2.vector4_f32[i])
        {
            ++uPassed;
        }
    }

    uint32_t CR = 0;
    if (uPassed == 4)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (uPassed == 0)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 >= V2, zero otherwise.
    uint32x4_t vResult = vcgeq_f32(V1, V2);
    // Two zips gather one byte of each lane's mask into a single 32-bit word.
    uint8x8x2_t vTemp = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vResult)), vget_high_u8(vreinterpretq_u8_u32(vResult)));
    uint16x4x2_t vTemp2 = vzip_u16(vreinterpret_u16_u8(vTemp.val[0]), vreinterpret_u16_u8(vTemp.val[1]));
    uint32_t r = vget_lane_u32(vreinterpret_u32_u16(vTemp2.val[1]), 1);

    uint32_t CR = 0;
    if (r == 0xFFFFFFFFU)
    {
        // Every lane passed.
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!r)
    {
        // No lane passed.
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    uint32_t CR = 0;
    XMVECTOR vTemp = _mm_cmpge_ps(V1, V2);
    // One bit per lane: 0xF means all lanes passed, 0 means none did.
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest == 0x0f)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#endif
}

//------------------------------------------------------------------------------

// Tests whether every component of V1 is strictly less than the matching
// component of V2. Returns true only when all four comparisons succeed.
inline bool XM_CALLCONV XMVector4Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first lane that fails the test.
    for (size_t i = 0; i < 4; ++i)
    {
        if (!(V1.vector4_f32[i] < V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 < V2, zero otherwise.
    const uint32x4_t vMask = vcltq_f32(V1, V2);
    // Two zips gather one byte of each lane's mask into a single 32-bit word.
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vMask);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return uPacked == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vMask = _mm_cmplt_ps(V1, V2);
    // movemask yields one bit per lane; 0xF means all four lanes passed.
    const int iLaneBits = _mm_movemask_ps(vMask);
    return iLaneBits == 0x0f;
#else
    // V1 < V2 is evaluated as V2 > V1 with the operands swapped.
    const uint32_t uRecord = XMVector4GreaterR(V2, V1);
    return XMComparisonAllTrue(uRecord);
#endif
}

//------------------------------------------------------------------------------

// Tests whether every component of V1 is less than or equal to the matching
// component of V2. Returns true only when all four comparisons succeed.
inline bool XM_CALLCONV XMVector4LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first lane that fails the test.
    for (size_t i = 0; i < 4; ++i)
    {
        if (!(V1.vector4_f32[i] <= V2.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane mask: all ones where V1 <= V2, zero otherwise.
    const uint32x4_t vMask = vcleq_f32(V1, V2);
    // Two zips gather one byte of each lane's mask into a single 32-bit word.
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vMask);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return uPacked == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vMask = _mm_cmple_ps(V1, V2);
    // movemask yields one bit per lane; 0xF means all four lanes passed.
    const int iLaneBits = _mm_movemask_ps(vMask);
    return iLaneBits == 0x0f;
#else
    // V1 <= V2 is evaluated as V2 >= V1 with the operands swapped.
    const uint32_t uRecord = XMVector4GreaterOrEqualR(V2, V1);
    return XMComparisonAllTrue(uRecord);
#endif
}

//------------------------------------------------------------------------------

// Tests whether every component of V lies inside the symmetric range
// [-Bounds, +Bounds] given by the matching component of Bounds. Returns true
// only when all four components are inside their range.
inline bool XM_CALLCONV XMVector4InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Stop at the first component that falls outside its range.
    for (size_t i = 0; i < 4; ++i)
    {
        if (!(V.vector4_f32[i] <= Bounds.vector4_f32[i] && V.vector4_f32[i] >= -Bounds.vector4_f32[i]))
        {
            return false;
        }
    }
    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Upper limit: V <= Bounds
    const uint32x4_t vBelowMax = vcleq_f32(V, Bounds);
    // Lower limit: -Bounds <= V
    const float32x4_t vNegBounds = vnegq_f32(Bounds);
    const uint32x4_t vAboveMin = vcleq_f32(vNegBounds, V);
    // A lane is in bounds only when both limits hold
    const uint32x4_t vInside = vandq_u32(vBelowMax, vAboveMin);
    // Two zips gather one byte of each lane's mask into a single 32-bit word
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vInside);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return uPacked == 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // Upper limit: V <= Bounds
    const XMVECTOR vBelowMax = _mm_cmple_ps(V, Bounds);
    // Lower limit: -Bounds <= V (the negation is done with a multiply by -1)
    const XMVECTOR vNegBounds = _mm_mul_ps(Bounds, g_XMNegativeOne);
    const XMVECTOR vAboveMin = _mm_cmple_ps(vNegBounds, V);
    // A lane is in bounds only when both limits hold
    const XMVECTOR vInside = _mm_and_ps(vBelowMax, vAboveMin);
    // One bit per lane; 0xF means all four lanes are in bounds
    const int iLaneBits = _mm_movemask_ps(vInside);
    return iLaneBits == 0x0f;
#else
    const uint32_t uRecord = XMVector4InBoundsR(V, Bounds);
    return XMComparisonAllInBounds(uRecord);
#endif
}

//------------------------------------------------------------------------------

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Returns true if at least one of the four components of V is NaN.
inline bool XM_CALLCONV XMVector4IsNaN(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Report as soon as any component tests as NaN.
    for (size_t i = 0; i < 4; ++i)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // A NaN never compares equal to itself, so its lane in the mask is zero.
    const uint32x4_t vSelfEqual = vceqq_f32(V, V);
    // Two zips gather one byte of each lane's mask into a single 32-bit word.
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vSelfEqual);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    // Anything short of "all lanes equal" means some lane held a NaN.
    return uPacked != 0xFFFFFFFFU;
#elif defined(_XM_SSE_INTRINSICS_)
    // A NaN always compares not-equal to itself, which sets its lane.
    const XMVECTOR vSelfNotEqual = _mm_cmpneq_ps(V, V);
    // Any set bit means some lane held a NaN.
    const int iLaneBits = _mm_movemask_ps(vSelfNotEqual);
    return iLaneBits != 0;
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Returns true if at least one of the four components of V is positive or
// negative infinity.
inline bool XM_CALLCONV XMVector4IsInfinite(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Report as soon as any component tests as infinite.
    for (size_t i = 0; i < 4; ++i)
    {
        if (XMISINF(V.vector4_f32[i]))
        {
            return true;
        }
    }
    return false;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Strip the sign so that -inf and +inf look the same
    const uint32x4_t vAbsBits = vandq_u32(vreinterpretq_u32_f32(V), g_XMAbsMask);
    // Lanes equal to infinity become all ones
    const uint32x4_t vIsInf = vceqq_f32(vreinterpretq_f32_u32(vAbsBits), g_XMInfinity);
    // Two zips gather one byte of each lane's mask into a single 32-bit word
    const uint8x16_t vMaskBytes = vreinterpretq_u8_u32(vIsInf);
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vMaskBytes), vget_high_u8(vMaskBytes));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    // Any non-zero byte means some lane matched infinity
    return uPacked != 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Strip the sign so that -inf and +inf look the same
    const XMVECTOR vAbs = _mm_and_ps(V, g_XMAbsMask);
    // Lanes equal to infinity become all ones
    const XMVECTOR vIsInf = _mm_cmpeq_ps(vAbs, g_XMInfinity);
    // Any set bit means some lane matched infinity
    const int iLaneBits = _mm_movemask_ps(vIsInf);
    return iLaneBits != 0;
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Computes the 4D dot product of V1 and V2 and returns it replicated into all
// four components of the result.
inline XMVECTOR XM_CALLCONV XMVector4Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float fDot = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3];
    XMVECTORF32 Result = { { { fDot, fDot, fDot, fDot } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane products
    const float32x4_t vProduct = vmulq_f32(V1, V2);
    // Fold the high half onto the low half, then add the remaining pair
    float32x2_t vSum = vadd_f32(vget_low_f32(vProduct), vget_high_f32(vProduct));
    vSum = vpadd_f32(vSum, vSum);
    // Both lanes of vSum hold the dot product; duplicate into all four
    return vcombine_f32(vSum, vSum);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    return _mm_dp_ps(V1, V2, 0xff);
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the full sum in every lane
    const XMVECTOR vProduct = _mm_mul_ps(V1, V2);
    const XMVECTOR vPairSums = _mm_hadd_ps(vProduct, vProduct);
    return _mm_hadd_ps(vPairSums, vPairSums);
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-lane products
    const XMVECTOR vProduct = _mm_mul_ps(V1, V2);
    // Move the X and Y products into the Z and W lanes (low lanes are don't-care)
    XMVECTOR vPartial = _mm_shuffle_ps(V2, vProduct, _MM_SHUFFLE(1, 0, 0, 0));
    // Z = X+Z; W = Y+W
    vPartial = _mm_add_ps(vPartial, vProduct);
    // Bring the W partial sum into the Z lane
    XMVECTOR vTotal = _mm_shuffle_ps(vProduct, vPartial, _MM_SHUFFLE(0, 3, 0, 0));
    // Z now holds (Y+W) + (X+Z)
    vTotal = _mm_add_ps(vTotal, vPartial);
    // Splat Z and return
    return XM_PERMUTE_PS(vTotal, _MM_SHUFFLE(2, 2, 2, 2));
#endif
}

//------------------------------------------------------------------------------

// Computes the 4D cross product of V1, V2 and V3: the vector whose components
// are given by the formula in the comment at the top of the function body.
// Every code path evaluates the same three terms:
//   Result = term1 * V1.yxxx - term2 * V1.zzyy + term3 * V1.wwwz
// where each term is a difference of two swizzled V2 * V3 products.
inline XMVECTOR XM_CALLCONV XMVector4Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
) noexcept
{
    // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w),
    //   ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w),
    //   ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w),
    //   ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ]

#if defined(_XM_NO_INTRINSICS_)

    // Direct evaluation of the formula above, one initializer per component (x, y, z, w).
    XMVECTORF32 Result = { { {
            (((V2.vector4_f32[2] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[2])) * V1.vector4_f32[1]) - (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[2]) + (((V2.vector4_f32[1] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[1])) * V1.vector4_f32[3]),
            (((V2.vector4_f32[3] * V3.vector4_f32[2]) - (V2.vector4_f32[2] * V3.vector4_f32[3])) * V1.vector4_f32[0]) - (((V2.vector4_f32[3] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[3])) * V1.vector4_f32[2]) + (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[3]),
            (((V2.vector4_f32[1] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[1])) * V1.vector4_f32[0]) - (((V2.vector4_f32[0] * V3.vector4_f32[3]) - (V2.vector4_f32[3] * V3.vector4_f32[0])) * V1.vector4_f32[1]) + (((V2.vector4_f32[0] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[0])) * V1.vector4_f32[3]),
            (((V2.vector4_f32[2] * V3.vector4_f32[1]) - (V2.vector4_f32[1] * V3.vector4_f32[2])) * V1.vector4_f32[0]) - (((V2.vector4_f32[2] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[2])) * V1.vector4_f32[1]) + (((V2.vector4_f32[1] * V3.vector4_f32[0]) - (V2.vector4_f32[0] * V3.vector4_f32[1])) * V1.vector4_f32[2]),
        } } };
    return Result.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The swizzles are assembled from 64-bit halves: vrev64 swaps the two
    // lanes of a half, and vbsl with 'select' merges two halves.
    // NOTE(review): the variable names imply 'select' takes lane 0 from the
    // first vbsl operand and lane 1 from the second, i.e. that g_XMMaskX is
    // all-ones in X only - confirm against its definition.
    const uint32x2_t select = vget_low_u32(g_XMMaskX);

    // Term1: V2zwyz * V3wzwy
    const float32x2_t v2xy = vget_low_f32(V2);
    const float32x2_t v2zw = vget_high_f32(V2);
    const float32x2_t v2yx = vrev64_f32(v2xy);
    const float32x2_t v2wz = vrev64_f32(v2zw);
    const float32x2_t v2yz = vbsl_f32(select, v2yx, v2wz);

    const float32x2_t v3zw = vget_high_f32(V3);
    const float32x2_t v3wz = vrev64_f32(v3zw);
    const float32x2_t v3xy = vget_low_f32(V3);
    const float32x2_t v3wy = vbsl_f32(select, v3wz, v3xy);

    float32x4_t vTemp1 = vcombine_f32(v2zw, v2yz);
    float32x4_t vTemp2 = vcombine_f32(v3wz, v3wy);
    XMVECTOR vResult = vmulq_f32(vTemp1, vTemp2);

    // - V2wzwy * V3zwyz
    const float32x2_t v2wy = vbsl_f32(select, v2wz, v2xy);

    const float32x2_t v3yx = vrev64_f32(v3xy);
    const float32x2_t v3yz = vbsl_f32(select, v3yx, v3wz);

    vTemp1 = vcombine_f32(v2wz, v2wy);
    vTemp2 = vcombine_f32(v3zw, v3yz);
    // vmlsq(a, b, c) computes a - b * c
    vResult = vmlsq_f32(vResult, vTemp1, vTemp2);

    // term1 * V1yxxx
    const float32x2_t v1xy = vget_low_f32(V1);
    const float32x2_t v1yx = vrev64_f32(v1xy);

    vTemp1 = vcombine_f32(v1yx, vdup_lane_f32(v1yx, 1));
    vResult = vmulq_f32(vResult, vTemp1);

    // Term2: V2ywxz * V3wxwx
    const float32x2_t v2yw = vrev64_f32(v2wy);
    const float32x2_t v2xz = vbsl_f32(select, v2xy, v2wz);

    const float32x2_t v3wx = vbsl_f32(select, v3wz, v3yx);

    vTemp1 = vcombine_f32(v2yw, v2xz);
    vTemp2 = vcombine_f32(v3wx, v3wx);
    float32x4_t vTerm = vmulq_f32(vTemp1, vTemp2);

    // - V2wxwx * V3ywxz
    const float32x2_t v2wx = vbsl_f32(select, v2wz, v2yx);

    const float32x2_t v3yw = vrev64_f32(v3wy);
    const float32x2_t v3xz = vbsl_f32(select, v3xy, v3wz);

    vTemp1 = vcombine_f32(v2wx, v2wx);
    vTemp2 = vcombine_f32(v3yw, v3xz);
    vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2);

    // vResult - term2 * V1zzyy
    const float32x2_t v1zw = vget_high_f32(V1);

    vTemp1 = vcombine_f32(vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0));
    vResult = vmlsq_f32(vResult, vTerm, vTemp1);

    // Term3: V2yzxy * V3zxyx
    const float32x2_t v3zx = vrev64_f32(v3xz);

    vTemp1 = vcombine_f32(v2yz, v2xy);
    vTemp2 = vcombine_f32(v3zx, v3yx);
    vTerm = vmulq_f32(vTemp1, vTemp2);

    // - V2zxyx * V3yzxy
    const float32x2_t v2zx = vrev64_f32(v2xz);

    vTemp1 = vcombine_f32(v2zx, v2yx);
    vTemp2 = vcombine_f32(v3yz, v3xy);
    vTerm = vmlsq_f32(vTerm, vTemp1, vTemp2);

    // vResult + term3 * V1wwwz
    const float32x2_t v1wz = vrev64_f32(v1zw);

    vTemp1 = vcombine_f32(vdup_lane_f32(v1wz, 0), v1wz);
    // vmlaq(a, b, c) computes a + b * c
    return vmlaq_f32(vResult, vTerm, vTemp1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Several permutes below are applied to an already-permuted temporary
    // (vTemp3 / vTemp2) rather than to the source vector, so the statement
    // order matters.

    // V2zwyz * V3wzwy
    XMVECTOR vResult = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 1, 3, 2));
    XMVECTOR vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 3, 2, 3));
    vResult = _mm_mul_ps(vResult, vTemp3);
    // - V2wzwy * V3zwyz
    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 3, 2, 3));
    // Re-permutes V3wzwy (not V3) to obtain V3zwyz
    vTemp3 = XM_PERMUTE_PS(vTemp3, _MM_SHUFFLE(1, 3, 0, 1));
    vResult = XM_FNMADD_PS(vTemp2, vTemp3, vResult);
    // term1 * V1yxxx
    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(0, 0, 0, 1));
    vResult = _mm_mul_ps(vResult, vTemp1);

    // V2ywxz * V3wxwx
    vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(2, 0, 3, 1));
    vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 3, 0, 3));
    vTemp3 = _mm_mul_ps(vTemp3, vTemp2);
    // - V2wxwx * V3ywxz
    // Re-permutes V2ywxz (not V2) to obtain V2wxwx
    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 1, 2, 1));
    vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(2, 0, 3, 1));
    vTemp3 = XM_FNMADD_PS(vTemp2, vTemp1, vTemp3);
    // vResult - temp * V1zzyy
    vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(1, 1, 2, 2));
    vResult = XM_FNMADD_PS(vTemp1, vTemp3, vResult);

    // V2yzxy * V3zxyx
    vTemp2 = XM_PERMUTE_PS(V2, _MM_SHUFFLE(1, 0, 2, 1));
    vTemp3 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(0, 1, 0, 2));
    vTemp3 = _mm_mul_ps(vTemp3, vTemp2);
    // - V2zxyx * V3yzxy
    // Re-permutes V2yzxy (not V2) to obtain V2zxyx
    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(2, 0, 2, 1));
    vTemp1 = XM_PERMUTE_PS(V3, _MM_SHUFFLE(1, 0, 2, 1));
    vTemp3 = XM_FNMADD_PS(vTemp1, vTemp2, vTemp3);
    // vResult + term * V1wwwz
    vTemp1 = XM_PERMUTE_PS(V1, _MM_SHUFFLE(2, 3, 3, 3));
    vResult = XM_FMADD_PS(vTemp3, vTemp1, vResult);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Returns the squared length of V, computed as the dot product of V with
// itself (XMVector4Dot replicates the value into every component).
inline XMVECTOR XM_CALLCONV XMVector4LengthSq(FXMVECTOR V) noexcept
{
    const XMVECTOR vLengthSq = XMVector4Dot(V, V);
    return vLengthSq;
}

//------------------------------------------------------------------------------

// Returns an estimate of 1 / length(V), replicated into every component.
// The SIMD paths use the hardware reciprocal-square-root estimate without
// refinement.
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrtEst(XMVector4LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4: square each lane, fold high half onto low half, add the pair
    const float32x4_t vSquares = vmulq_f32(V, V);
    float32x2_t vLengthSq = vadd_f32(vget_low_f32(vSquares), vget_high_f32(vSquares));
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    // Reciprocal sqrt (estimate)
    const float32x2_t vRcpLength = vrsqrte_f32(vLengthSq);
    return vcombine_f32(vRcpLength, vRcpLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    return _mm_rsqrt_ps(_mm_dp_ps(V, V, 0xff));
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the squared length in every lane
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPairSums, vPairSums);
    return _mm_rsqrt_ps(vLengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Square each component: x,y,z,w
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // z,w,z,w
    const XMVECTOR vHighPair = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w in the low two lanes
    XMVECTOR vPartial = _mm_add_ps(vSquares, vHighPair);
    // x+z,x+z,x+z,y+w
    vPartial = XM_PERMUTE_PS(vPartial, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    const XMVECTOR vOtherPair = _mm_shuffle_ps(vHighPair, vPartial, _MM_SHUFFLE(3, 3, 0, 0));
    // Lane Z becomes (x+z)+(y+w); the other lanes are don't-care
    XMVECTOR vLengthSq = _mm_add_ps(vPartial, vOtherPair);
    // Splat lane Z
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Reciprocal sqrt (estimate)
    return _mm_rsqrt_ps(vLengthSq);
#endif
}

//------------------------------------------------------------------------------

// Returns 1 / length(V), replicated into every component. The SSE paths use a
// full-precision sqrt followed by a divide; the NEON path refines the hardware
// estimate with two vrsqrts steps.
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorReciprocalSqrt(XMVector4LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4: square each lane, fold high half onto low half, add the pair
    const float32x4_t vSquares = vmulq_f32(V, V);
    float32x2_t vLengthSq = vadd_f32(vget_low_f32(vSquares), vget_high_f32(vSquares));
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    // Reciprocal sqrt: initial estimate plus two refinement steps
    const float32x2_t vEst0 = vrsqrte_f32(vLengthSq);
    const float32x2_t vProd0 = vmul_f32(vLengthSq, vEst0);
    const float32x2_t vStep0 = vrsqrts_f32(vProd0, vEst0);
    const float32x2_t vEst1 = vmul_f32(vEst0, vStep0);
    const float32x2_t vProd1 = vmul_f32(vLengthSq, vEst1);
    const float32x2_t vStep1 = vrsqrts_f32(vProd1, vEst1);
    const float32x2_t vRcpLength = vmul_f32(vEst1, vStep1);
    return vcombine_f32(vRcpLength, vRcpLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    const XMVECTOR vLength = _mm_sqrt_ps(_mm_dp_ps(V, V, 0xff));
    return _mm_div_ps(g_XMOne, vLength);
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the squared length in every lane
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPairSums, vPairSums);
    const XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    return _mm_div_ps(g_XMOne, vLength);
#elif defined(_XM_SSE_INTRINSICS_)
    // Square each component: x,y,z,w
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // z,w,z,w
    const XMVECTOR vHighPair = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w in the low two lanes
    XMVECTOR vPartial = _mm_add_ps(vSquares, vHighPair);
    // x+z,x+z,x+z,y+w
    vPartial = XM_PERMUTE_PS(vPartial, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    const XMVECTOR vOtherPair = _mm_shuffle_ps(vHighPair, vPartial, _MM_SHUFFLE(3, 3, 0, 0));
    // Lane Z becomes (x+z)+(y+w); the other lanes are don't-care
    XMVECTOR vLengthSq = _mm_add_ps(vPartial, vOtherPair);
    // Splat lane Z
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Length, then an exact divide for the reciprocal
    const XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);
    return _mm_div_ps(g_XMOne, vLength);
#endif
}

//------------------------------------------------------------------------------

// Returns an estimate of the length of V, replicated into every component.
// The NEON path multiplies the squared length by the hardware reciprocal-sqrt
// estimate and forces a zero result for a zero squared length; the SSE paths
// use _mm_sqrt_ps.
inline XMVECTOR XM_CALLCONV XMVector4LengthEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrtEst(XMVector4LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4: square each lane, fold high half onto low half, add the pair
    const float32x4_t vSquares = vmulq_f32(V, V);
    float32x2_t vLengthSq = vadd_f32(vget_low_f32(vSquares), vget_high_f32(vSquares));
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    // Remember which lanes are exactly zero so they can be patched afterwards
    const float32x2_t vZero = vdup_n_f32(0);
    const uint32x2_t vIsZero = vceq_f32(vLengthSq, vZero);
    // Sqrt (estimate): x * rsqrt(x)
    float32x2_t vLength = vrsqrte_f32(vLengthSq);
    vLength = vmul_f32(vLengthSq, vLength);
    // Select zero where the squared length was zero
    vLength = vbsl_f32(vIsZero, vZero, vLength);
    return vcombine_f32(vLength, vLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    return _mm_sqrt_ps(_mm_dp_ps(V, V, 0xff));
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the squared length in every lane
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPairSums, vPairSums);
    return _mm_sqrt_ps(vLengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Square each component: x,y,z,w
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // z,w,z,w
    const XMVECTOR vHighPair = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w in the low two lanes
    XMVECTOR vPartial = _mm_add_ps(vSquares, vHighPair);
    // x+z,x+z,x+z,y+w
    vPartial = XM_PERMUTE_PS(vPartial, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    const XMVECTOR vOtherPair = _mm_shuffle_ps(vHighPair, vPartial, _MM_SHUFFLE(3, 3, 0, 0));
    // Lane Z becomes (x+z)+(y+w); the other lanes are don't-care
    XMVECTOR vLengthSq = _mm_add_ps(vPartial, vOtherPair);
    // Splat lane Z
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Get the length
    return _mm_sqrt_ps(vLengthSq);
#endif
}

//------------------------------------------------------------------------------

// Returns the length of V, replicated into every component. The NEON path
// computes x * rsqrt(x) with two refinement steps and forces a zero result for
// a zero squared length; the SSE paths use _mm_sqrt_ps.
inline XMVECTOR XM_CALLCONV XMVector4Length(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorSqrt(XMVector4LengthSq(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4: square each lane, fold high half onto low half, add the pair
    const float32x4_t vSquares = vmulq_f32(V, V);
    float32x2_t vLengthSq = vadd_f32(vget_low_f32(vSquares), vget_high_f32(vSquares));
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    // Remember which lanes are exactly zero so they can be patched afterwards
    const float32x2_t vZero = vdup_n_f32(0);
    const uint32x2_t vIsZero = vceq_f32(vLengthSq, vZero);
    // Reciprocal sqrt: initial estimate plus two refinement steps
    const float32x2_t vEst0 = vrsqrte_f32(vLengthSq);
    const float32x2_t vProd0 = vmul_f32(vLengthSq, vEst0);
    const float32x2_t vStep0 = vrsqrts_f32(vProd0, vEst0);
    const float32x2_t vEst1 = vmul_f32(vEst0, vStep0);
    const float32x2_t vProd1 = vmul_f32(vLengthSq, vEst1);
    const float32x2_t vStep1 = vrsqrts_f32(vProd1, vEst1);
    float32x2_t vLength = vmul_f32(vEst1, vStep1);
    // Sqrt: x * rsqrt(x)
    vLength = vmul_f32(vLengthSq, vLength);
    // Select zero where the squared length was zero
    vLength = vbsl_f32(vIsZero, vZero, vLength);
    return vcombine_f32(vLength, vLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    return _mm_sqrt_ps(_mm_dp_ps(V, V, 0xff));
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the squared length in every lane
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPairSums, vPairSums);
    return _mm_sqrt_ps(vLengthSq);
#elif defined(_XM_SSE_INTRINSICS_)
    // Square each component: x,y,z,w
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // z,w,z,w
    const XMVECTOR vHighPair = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w in the low two lanes
    XMVECTOR vPartial = _mm_add_ps(vSquares, vHighPair);
    // x+z,x+z,x+z,y+w
    vPartial = XM_PERMUTE_PS(vPartial, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    const XMVECTOR vOtherPair = _mm_shuffle_ps(vHighPair, vPartial, _MM_SHUFFLE(3, 3, 0, 0));
    // Lane Z becomes (x+z)+(y+w); the other lanes are don't-care
    XMVECTOR vLengthSq = _mm_add_ps(vPartial, vOtherPair);
    // Splat lane Z
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Get the length
    return _mm_sqrt_ps(vLengthSq);
#endif
}

//------------------------------------------------------------------------------
// XMVector4NormalizeEst scales V by the reciprocal of its length. The SIMD
// paths use the hardware reciprocal-square-root estimate; the no-intrinsics
// path calls XMVector4ReciprocalLength. Zero and infinite vectors are not
// special-cased here and return QNaN.

inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    return XMVectorMultiply(V, XMVector4ReciprocalLength(V));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4: square each lane, fold high half onto low half, add the pair
    const float32x4_t vSquares = vmulq_f32(V, V);
    float32x2_t vLengthSq = vadd_f32(vget_low_f32(vSquares), vget_high_f32(vSquares));
    vLengthSq = vpadd_f32(vLengthSq, vLengthSq);
    // Reciprocal sqrt (estimate)
    const float32x2_t vRcpLength = vrsqrte_f32(vLengthSq);
    // Normalize
    return vmulq_f32(V, vcombine_f32(vRcpLength, vRcpLength));
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    const XMVECTOR vRcpLength = _mm_rsqrt_ps(_mm_dp_ps(V, V, 0xff));
    return _mm_mul_ps(vRcpLength, V);
#elif defined(_XM_SSE3_INTRINSICS_)
    // Two horizontal adds leave the squared length in every lane
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    const XMVECTOR vPairSums = _mm_hadd_ps(vSquares, vSquares);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPairSums, vPairSums);
    const XMVECTOR vRcpLength = _mm_rsqrt_ps(vLengthSq);
    return _mm_mul_ps(vRcpLength, V);
#elif defined(_XM_SSE_INTRINSICS_)
    // Square each component: x,y,z,w
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    // z,w,z,w
    const XMVECTOR vHighPair = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w in the low two lanes
    XMVECTOR vPartial = _mm_add_ps(vSquares, vHighPair);
    // x+z,x+z,x+z,y+w
    vPartial = XM_PERMUTE_PS(vPartial, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    const XMVECTOR vOtherPair = _mm_shuffle_ps(vHighPair, vPartial, _MM_SHUFFLE(3, 3, 0, 0));
    // Lane Z becomes (x+z)+(y+w); the other lanes are don't-care
    XMVECTOR vLengthSq = _mm_add_ps(vPartial, vOtherPair);
    // Splat lane Z
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Reciprocal sqrt (estimate)
    const XMVECTOR vRcpLength = _mm_rsqrt_ps(vLengthSq);
    // Scale V by the reciprocal length to normalize it
    return _mm_mul_ps(vRcpLength, V);
#endif
}

//------------------------------------------------------------------------------

// Returns V scaled to unit length.
// Special cases handled by the SIMD paths:
//   - zero length:            the result is forced to the zero vector
//   - infinite squared length: every component of the result is QNaN
// The no-intrinsics path only guards the divide: when the length is not
// greater than zero, V is multiplied by the unmodified length instead of by
// its reciprocal.
inline XMVECTOR XM_CALLCONV XMVector4Normalize(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    float fLength;
    XMVECTOR vResult;

    // XMVector4Length replicates the length; read it from the first component
    vResult = XMVector4Length(V);
    fLength = vResult.vector4_f32[0];

    // Prevent divide by zero
    if (fLength > 0)
    {
        fLength = 1.0f / fLength;
    }

    // Scale every component by the reciprocal length
    vResult.vector4_f32[0] = V.vector4_f32[0] * fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1] * fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2] * fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3] * fLength;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    float32x4_t vTemp = vmulq_f32(V, V);
    float32x2_t v1 = vget_low_f32(vTemp);
    float32x2_t v2 = vget_high_f32(vTemp);
    v1 = vadd_f32(v1, v2);
    v1 = vpadd_f32(v1, v1);
    // Record the special cases (squared length == 0 or == infinity) up front
    uint32x2_t VEqualsZero = vceq_f32(v1, vdup_n_f32(0));
    uint32x2_t VEqualsInf = vceq_f32(v1, vget_low_f32(g_XMInfinity));
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    float32x2_t S0 = vrsqrte_f32(v1);
    float32x2_t P0 = vmul_f32(v1, S0);
    float32x2_t R0 = vrsqrts_f32(P0, S0);
    float32x2_t S1 = vmul_f32(S0, R0);
    float32x2_t P1 = vmul_f32(v1, S1);
    float32x2_t R1 = vrsqrts_f32(P1, S1);
    v2 = vmul_f32(S1, R1);
    // Normalize
    XMVECTOR vResult = vmulq_f32(V, vcombine_f32(v2, v2));
    // Zero length: replace the result with zero
    vResult = vbslq_f32(vcombine_u32(VEqualsZero, VEqualsZero), vdupq_n_f32(0), vResult);
    // Infinite squared length: replace the result with QNaN
    return vbslq_f32(vcombine_u32(VEqualsInf, VEqualsInf), g_XMQNaN, vResult);
#elif defined(_XM_SSE4_INTRINSICS_)
    // 0xff: multiply all four lanes and write the sum to all four lanes
    XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0xff);
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    // vZeroMask is all ones where the length is non-zero
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Build the "squared length is not infinity" mask
    // (vLengthSq is reused to hold the mask from here on)
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Where the length was zero, force the result to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(_XM_SSE3_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    vLengthSq = _mm_hadd_ps(vLengthSq, vLengthSq);
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    // vZeroMask is all ones where the length is non-zero
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Build the "squared length is not infinity" mask
    // (vLengthSq is reused to hold the mask from here on)
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Where the length was zero, force the result to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V, V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(3, 2, 3, 2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(1, 0, 0, 0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp, vLengthSq, _MM_SHUFFLE(3, 3, 0, 0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq, vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq, _MM_SHUFFLE(2, 2, 2, 2));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    // vZeroMask is all ones where the length is non-zero
    vZeroMask = _mm_cmpneq_ps(vZeroMask, vResult);
    // Build the "squared length is not infinity" mask
    // (vLengthSq is reused to hold the mask from here on)
    vLengthSq = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V, vResult);
    // Where the length was zero, force the result to zero
    vResult = _mm_and_ps(vResult, vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq, g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult, vLengthSq);
    vResult = _mm_or_ps(vTemp1, vTemp2);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Scalar-bounds convenience overload: replicates LengthMin and LengthMax
// across a vector and forwards to XMVector4ClampLengthV.
inline XMVECTOR XM_CALLCONV XMVector4ClampLength
(
    FXMVECTOR V,
    float    LengthMin,
    float    LengthMax
) noexcept
{
    return XMVector4ClampLengthV(V, XMVectorReplicate(LengthMin), XMVectorReplicate(LengthMax));
}

//------------------------------------------------------------------------------

// Clamps the 4D length of V to the range [LengthMin, LengthMax].
//
// LengthMin and LengthMax must each hold the same value in all four
// components, be non-negative, and satisfy LengthMax >= LengthMin (all of
// this is enforced by asserts only). When the length of V already lies within
// the range, V is returned unchanged so that no precision is lost.
inline XMVECTOR XM_CALLCONV XMVector4ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
) noexcept
{
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero()));
    assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero()));
    assert(XMVector4GreaterOrEqual(LengthMax, LengthMin));

    const XMVECTOR vLenSq = XMVector4LengthSq(V);
    const XMVECTOR vInvLen = XMVectorReciprocalSqrt(vLenSq);

    // The two masks can never both be set, so they compare equal only when
    // the squared length is neither infinite nor zero.
    const XMVECTOR vIsInfinite = XMVectorEqualInt(vLenSq, g_XMInfinity.v);
    const XMVECTOR vIsZero = XMVectorEqual(vLenSq, XMVectorZero());
    const XMVECTOR vUsable = XMVectorEqualInt(vIsInfinite, vIsZero);

    // In the degenerate cases the squared length itself stands in for both
    // the length and the direction.
    const XMVECTOR vLen = XMVectorSelect(vLenSq, XMVectorMultiply(vLenSq, vInvLen), vUsable);
    const XMVECTOR vDir = XMVectorSelect(vLenSq, XMVectorMultiply(V, vInvLen), vUsable);

    const XMVECTOR vAboveMax = XMVectorGreater(vLen, LengthMax);
    const XMVECTOR vBelowMin = XMVectorLess(vLen, LengthMin);

    // Pick the target length: the upper bound if too long, the lower bound if
    // too short, otherwise the current length.
    XMVECTOR vTargetLen = XMVectorSelect(vLen, LengthMax, vAboveMax);
    vTargetLen = XMVectorSelect(vTargetLen, LengthMin, vBelowMin);

    // Both range masks are clear (and therefore equal) exactly when the
    // length is already in range; hand back V untouched in that case.
    const XMVECTOR vInRange = XMVectorEqualInt(vAboveMax, vBelowMin);
    return XMVectorSelect(XMVectorMultiply(vDir, vTargetLen), V, vInRange);
}

//------------------------------------------------------------------------------

// Reflects Incident about Normal:
//   Result = Incident - (2 * dot(Incident, Normal)) * Normal
inline XMVECTOR XM_CALLCONV XMVector4Reflect
(
    FXMVECTOR Incident,
    FXMVECTOR Normal
) noexcept
{
    const XMVECTOR vDot = XMVector4Dot(Incident, Normal);
    // Doubling is done with an add rather than a multiply by 2.
    const XMVECTOR vTwiceDot = XMVectorAdd(vDot, vDot);
    return XMVectorNegativeMultiplySubtract(vTwiceDot, Normal, Incident);
}

//------------------------------------------------------------------------------

// Scalar-index convenience overload: replicates RefractionIndex across a
// vector and forwards to XMVector4RefractV.
inline XMVECTOR XM_CALLCONV XMVector4Refract
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    float    RefractionIndex
) noexcept
{
    return XMVector4RefractV(Incident, Normal, XMVectorReplicate(RefractionIndex));
}

//------------------------------------------------------------------------------

// Refracts Incident across Normal using a per-component refraction index.
//
//   k      = 1 - RefractionIndex^2 * (1 - dot(Incident, Normal)^2)
//   Result = RefractionIndex * Incident
//            - Normal * (RefractionIndex * dot(Incident, Normal) + sqrt(k))
//
// When every component of k is <= 0 (total internal reflection) a zero vector
// is returned instead.
inline XMVECTOR XM_CALLCONV XMVector4RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR vZero = XMVectorZero();
    const XMVECTOR vCosI = XMVector4Dot(Incident, Normal);

    // k = 1.0f - RefractionIndex * RefractionIndex * (1.0f - cosI * cosI)
    XMVECTOR vK = XMVectorNegativeMultiplySubtract(vCosI, vCosI, g_XMOne.v);
    vK = XMVectorMultiply(vK, RefractionIndex);
    vK = XMVectorNegativeMultiplySubtract(vK, RefractionIndex, g_XMOne.v);

    if (XMVector4LessOrEqual(vK, vZero))
    {
        // Total internal reflection
        return vZero;
    }

    // scale = RefractionIndex * cosI + sqrt(k)
    const XMVECTOR vScale = XMVectorMultiplyAdd(RefractionIndex, vCosI, XMVectorSqrt(vK));

    // Result = RefractionIndex * Incident - Normal * scale
    const XMVECTOR vScaledIncident = XMVectorMultiply(RefractionIndex, Incident);
    return XMVectorNegativeMultiplySubtract(Normal, vScale, vScaledIncident);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vCosI = XMVector4Dot(Incident, Normal);

    // k = 1.0f - RefractionIndex * RefractionIndex * (1.0f - cosI * cosI)
    float32x4_t vK = vmlsq_f32(g_XMOne, vCosI, vCosI);
    vK = vmulq_f32(vK, RefractionIndex);
    vK = vmlsq_f32(g_XMOne, vK, RefractionIndex);

    // Fold the four per-lane (k <= 0) masks into a single 32-bit word so that
    // "every lane set" can be tested with one scalar compare.
    uint32x4_t vNonPositive = vcleq_f32(vK, g_XMZero);
    uint8x8x2_t vBytes = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(vNonPositive)), vget_high_u8(vreinterpretq_u8_u32(vNonPositive)));
    uint16x4x2_t vWords = vzip_u16(vreinterpret_u16_u8(vBytes.val[0]), vreinterpret_u16_u8(vBytes.val[1]));

    if (vget_lane_u32(vreinterpret_u32_u16(vWords.val[1]), 1) == 0xFFFFFFFFU)
    {
        // Total internal reflection
        return g_XMZero.v;
    }

    // sqrt(k) = k * rsqrt(k), with the reciprocal square root estimate
    // refined by two vrsqrtsq_f32 steps.
    float32x4_t vEst0 = vrsqrteq_f32(vK);
    float32x4_t vProd0 = vmulq_f32(vK, vEst0);
    float32x4_t vStep0 = vrsqrtsq_f32(vProd0, vEst0);
    float32x4_t vEst1 = vmulq_f32(vEst0, vStep0);
    float32x4_t vProd1 = vmulq_f32(vK, vEst1);
    float32x4_t vStep1 = vrsqrtsq_f32(vProd1, vEst1);
    float32x4_t vEst2 = vmulq_f32(vEst1, vStep1);
    float32x4_t vRoot = vmulq_f32(vK, vEst2);

    // scale = RefractionIndex * cosI + sqrt(k)
    float32x4_t vScale = vmlaq_f32(vRoot, RefractionIndex, vCosI);

    // Result = RefractionIndex * Incident - Normal * scale
    float32x4_t vScaledIncident = vmulq_f32(RefractionIndex, Incident);
    return vmlsq_f32(vScaledIncident, vScale, Normal);
#elif defined(_XM_SSE_INTRINSICS_)
    const XMVECTOR vCosI = XMVector4Dot(Incident, Normal);

    // k = 1.0f - RefractionIndex * RefractionIndex * (1.0f - cosI * cosI)
    // (this path squares the index first, then applies it in one step)
    XMVECTOR vK = XM_FNMADD_PS(vCosI, vCosI, g_XMOne);
    const XMVECTOR vIndexSq = _mm_mul_ps(RefractionIndex, RefractionIndex);
    vK = XM_FNMADD_PS(vK, vIndexSq, g_XMOne);

    const XMVECTOR vNonPositive = _mm_cmple_ps(vK, g_XMZero);
    if (_mm_movemask_ps(vNonPositive) == 0x0f)
    {
        // Total internal reflection
        return g_XMZero;
    }

    // scale = RefractionIndex * cosI + sqrt(k)
    const XMVECTOR vRoot = _mm_sqrt_ps(vK);
    const XMVECTOR vScale = XM_FMADD_PS(RefractionIndex, vCosI, vRoot);

    // Result = RefractionIndex * Incident - Normal * scale
    const XMVECTOR vScaledIncident = _mm_mul_ps(RefractionIndex, Incident);
    return XM_FNMADD_PS(vScale, Normal, vScaledIncident);
#endif
}

//------------------------------------------------------------------------------

// Builds a vector orthogonal to V by swapping the xy and zw halves and
// negating the new z and w components: (x, y, z, w) -> (z, w, -x, -y).
inline XMVECTOR XM_CALLCONV XMVector4Orthogonal(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float fX = V.vector4_f32[0];
    const float fY = V.vector4_f32[1];
    const float fZ = V.vector4_f32[2];
    const float fW = V.vector4_f32[3];

    XMVECTORF32 vOrtho = { { { fZ, fW, -fX, -fY } } };
    return vOrtho.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 SignZW = { { { 1.f, 1.f, -1.f, -1.f } } };

    // Swap the low and high halves, then flip the sign of the upper pair
    float32x4_t vSwapped = vcombine_f32(vget_high_f32(V), vget_low_f32(V));
    return vmulq_f32(vSwapped, SignZW);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 SignZW = { { { 1.0f, 1.0f, -1.0f, -1.0f } } };

    // Shuffle to (z, w, x, y), then flip the sign of the upper pair
    const XMVECTOR vSwapped = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 0, 3, 2));
    return _mm_mul_ps(vSwapped, SignZW);
#endif
}

//------------------------------------------------------------------------------

// Estimated angle between two 4D normals: the dot product is clamped to
// [-1, 1] and passed through XMVectorACosEst.
inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormalsEst
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR vCosAngle = XMVectorClamp(XMVector4Dot(N1, N2), g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACosEst(vCosAngle);
}

//------------------------------------------------------------------------------

// Angle between two 4D normals: the dot product is clamped to [-1, 1] and
// passed through XMVectorACos.
inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenNormals
(
    FXMVECTOR N1,
    FXMVECTOR N2
) noexcept
{
    const XMVECTOR vCosAngle = XMVectorClamp(XMVector4Dot(N1, N2), g_XMNegativeOne.v, g_XMOne.v);
    return XMVectorACos(vCosAngle);
}

//------------------------------------------------------------------------------

// Angle between two arbitrary (not necessarily unit-length) 4D vectors:
//   acos( clamp( dot(V1, V2) * (1/|V1| * 1/|V2|), -1, 1 ) )
inline XMVECTOR XM_CALLCONV XMVector4AngleBetweenVectors
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR vInvLen1 = XMVector4ReciprocalLength(V1);
    const XMVECTOR vInvLen2 = XMVector4ReciprocalLength(V2);
    const XMVECTOR vDot = XMVector4Dot(V1, V2);

    // Combine the two reciprocal lengths first, then scale the dot product
    const XMVECTOR vInvLenProduct = XMVectorMultiply(vInvLen1, vInvLen2);
    const XMVECTOR vCosAngle = XMVectorClamp(XMVectorMultiply(vDot, vInvLenProduct), g_XMNegativeOne.v, g_XMOne.v);

    return XMVectorACos(vCosAngle);
}

//------------------------------------------------------------------------------

// Transforms the 4D vector V by the matrix M, treating V as a row vector:
//   Result = V.x * M.r[0] + V.y * M.r[1] + V.z * M.r[2] + V.w * M.r[3]
inline XMVECTOR XM_CALLCONV XMVector4Transform
(
    FXMVECTOR V,
    FXMMATRIX M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float x = V.vector4_f32[0];
    const float y = V.vector4_f32[1];
    const float z = V.vector4_f32[2];
    const float w = V.vector4_f32[3];

    // Each output component is the dot product of V with one matrix column
    const float outX = (M.m[0][0] * x) + (M.m[1][0] * y) + (M.m[2][0] * z) + (M.m[3][0] * w);
    const float outY = (M.m[0][1] * x) + (M.m[1][1] * y) + (M.m[2][1] * z) + (M.m[3][1] * w);
    const float outZ = (M.m[0][2] * x) + (M.m[1][2] * y) + (M.m[2][2] * z) + (M.m[3][2] * w);
    const float outW = (M.m[0][3] * x) + (M.m[1][3] * y) + (M.m[2][3] * z) + (M.m[3][3] * w);

    XMVECTORF32 vOut = { { { outX, outY, outZ, outW } } };
    return vOut.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Accumulate row by row in x, y, z, w order
    float32x2_t vXY = vget_low_f32(V);
    XMVECTOR vAccum = vmulq_lane_f32(M.r[0], vXY, 0); // X
    vAccum = vmlaq_lane_f32(vAccum, M.r[1], vXY, 1); // Y
    float32x2_t vZW = vget_high_f32(V);
    vAccum = vmlaq_lane_f32(vAccum, M.r[2], vZW, 0); // Z
    vAccum = vmlaq_lane_f32(vAccum, M.r[3], vZW, 1); // W
    return vAccum;
#elif defined(_XM_SSE_INTRINSICS_)
    // Accumulate row by row in w, z, y, x order
    XMVECTOR vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3)); // W
    XMVECTOR vAccum = _mm_mul_ps(vSplat, M.r[3]);
    vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2)); // Z
    vAccum = XM_FMADD_PS(vSplat, M.r[2], vAccum);
    vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1)); // Y
    vAccum = XM_FMADD_PS(vSplat, M.r[1], vAccum);
    vSplat = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0)); // X
    vAccum = XM_FMADD_PS(vSplat, M.r[0], vAccum);
    return vAccum;
#endif
}

//------------------------------------------------------------------------------

// Transforms a stream of 4D vectors by the matrix M.
//
//   pOutputStream  Destination of the transformed vectors (must not be null).
//   OutputStride   Distance in bytes between consecutive outputs; must be at
//                  least sizeof(XMFLOAT4).
//   pInputStream   Source vectors (must not be null).
//   InputStride    Distance in bytes between consecutive inputs; must be at
//                  least sizeof(XMFLOAT4).
//   VectorCount    Number of vectors to transform.
//   M              Transformation matrix; each result is
//                  x * M.r[0] + y * M.r[1] + z * M.r[2] + w * M.r[3].
//
// Returns pOutputStream. The pointer and stride preconditions are checked
// with asserts only.
//
// Every SIMD path computes the same per-vector result; they differ in how many
// vectors are handled per iteration and in which load/store flavour is chosen
// from the pointer alignment and strides.
inline XMFLOAT4* XM_CALLCONV XMVector4TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t          OutputStride,
    const XMFLOAT4* pInputStream,
    size_t          InputStride,
    size_t          VectorCount,
    FXMMATRIX       M
) noexcept
{
    assert(pOutputStream != nullptr);
    assert(pInputStream != nullptr);

    assert(InputStride >= sizeof(XMFLOAT4));
    

    assert(OutputStride >= sizeof(XMFLOAT4));
    

#if defined(_XM_NO_INTRINSICS_)

    // Walk both streams as raw bytes so that the strides apply in bytes
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pInputVector));
        XMVECTOR W = XMVectorSplatW(V);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Result = W * row3 + Z * row2 + Y * row1 + X * row0
        XMVECTOR Result = XMVectorMultiply(W, row3);
        Result = XMVectorMultiplyAdd(Z, row2, Result);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable : 26015, "PREfast noise: Esp:1307" )
#endif

        XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(pOutputVector), Result);

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    size_t i = 0;
    size_t four = VectorCount >> 2;
    if (four > 0)
    {
        // The four-at-a-time path is only taken when both streams are tightly packed
        if ((InputStride == sizeof(XMFLOAT4)) && (OutputStride == sizeof(XMFLOAT4)))
        {
            for (size_t j = 0; j < four; ++j)
            {
                // De-interleaving load: V.val[0..3] hold the x, y, z and w
                // components of four consecutive input vectors
                float32x4x4_t V = vld4q_f32(reinterpret_cast<const float*>(pInputVector));
                pInputVector += sizeof(XMFLOAT4) * 4;

                float32x2_t r = vget_low_f32(row0);
                XMVECTOR vResult0 = vmulq_lane_f32(V.val[0], r, 0); // Ax
                XMVECTOR vResult1 = vmulq_lane_f32(V.val[0], r, 1); // Bx

                // Prefetches of upcoming input are interleaved with the arithmetic
                XM_PREFETCH(pInputVector);

                r = vget_high_f32(row0);
                XMVECTOR vResult2 = vmulq_lane_f32(V.val[0], r, 0); // Cx
                XMVECTOR vResult3 = vmulq_lane_f32(V.val[0], r, 1); // Dx

                XM_PREFETCH(pInputVector + XM_CACHE_LINE_SIZE);

                r = vget_low_f32(row1);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[1], r, 0); // Ax+Ey
                vResult1 = vmlaq_lane_f32(vResult1, V.val[1], r, 1); // Bx+Fy

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 2));

                r = vget_high_f32(row1);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[1], r, 0); // Cx+Gy
                vResult3 = vmlaq_lane_f32(vResult3, V.val[1], r, 1); // Dx+Hy

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 3));

                r = vget_low_f32(row2);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[2], r, 0); // Ax+Ey+Iz
                vResult1 = vmlaq_lane_f32(vResult1, V.val[2], r, 1); // Bx+Fy+Jz

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 4));

                r = vget_high_f32(row2);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[2], r, 0); // Cx+Gy+Kz
                vResult3 = vmlaq_lane_f32(vResult3, V.val[2], r, 1); // Dx+Hy+Lz

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 5));

                r = vget_low_f32(row3);
                vResult0 = vmlaq_lane_f32(vResult0, V.val[3], r, 0); // Ax+Ey+Iz+Mw
                vResult1 = vmlaq_lane_f32(vResult1, V.val[3], r, 1); // Bx+Fy+Jz+Nw

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 6));

                r = vget_high_f32(row3);
                vResult2 = vmlaq_lane_f32(vResult2, V.val[3], r, 0); // Cx+Gy+Kz+Ow
                vResult3 = vmlaq_lane_f32(vResult3, V.val[3], r, 1); // Dx+Hy+Lz+Pw

                XM_PREFETCH(pInputVector + (XM_CACHE_LINE_SIZE * 7));

                // Reuse V to hold the four result component planes, then
                // re-interleave them on store
                V.val[0] = vResult0;
                V.val[1] = vResult1;
                V.val[2] = vResult2;
                V.val[3] = vResult3;

                vst4q_f32(reinterpret_cast<float*>(pOutputVector), V);
                pOutputVector += sizeof(XMFLOAT4) * 4;

                i += 4;
            }
        }
    }

    // Remaining vectors (or all of them when the streams are not packed),
    // one per iteration
    for (; i < VectorCount; i++)
    {
        XMVECTOR V = vld1q_f32(reinterpret_cast<const float*>(pInputVector));
        pInputVector += InputStride;

        float32x2_t VL = vget_low_f32(V);
        XMVECTOR vResult = vmulq_lane_f32(row0, VL, 0); // X
        vResult = vmlaq_lane_f32(vResult, row1, VL, 1); // Y
        float32x2_t VH = vget_high_f32(V);
        vResult = vmlaq_lane_f32(vResult, row2, VH, 0); // Z
        vResult = vmlaq_lane_f32(vResult, row3, VH, 1); // W

        vst1q_f32(reinterpret_cast<float*>(pOutputVector), vResult);
        pOutputVector += OutputStride;
    }

    return pOutputStream;
#elif defined(_XM_AVX2_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    size_t i = 0;
    size_t two = VectorCount >> 1;
    if (two > 0)
    {
        // Each matrix row is duplicated into both 128-bit halves so that one
        // 256-bit operation transforms two vectors at once
        __m256 row0 = _mm256_broadcast_ps(&M.r[0]);
        __m256 row1 = _mm256_broadcast_ps(&M.r[1]);
        __m256 row2 = _mm256_broadcast_ps(&M.r[2]);
        __m256 row3 = _mm256_broadcast_ps(&M.r[3]);

        // The paired path needs packed input so that two vectors can be
        // fetched with a single 256-bit load
        if (InputStride == sizeof(XMFLOAT4))
        {
            if (OutputStride == sizeof(XMFLOAT4))
            {
                // The 32-byte alignment test is made on the start of the output stream
                if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0x1F))
                {
                    // Packed input, aligned & packed output
                    for (size_t j = 0; j < two; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT4) * 2;

                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                        // (x*row0 + z*row2) + (y*row1 + w*row3)
                        vTempX = _mm256_mul_ps(vTempX, row0);
                        vTempY = _mm256_mul_ps(vTempY, row1);
                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                        vTempX = _mm256_add_ps(vTempZ, vTempW);

                        // NOTE(review): XM256_STREAM_PS is defined elsewhere;
                        // presumably a non-temporal 256-bit store - confirm
                        XM256_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        i += 2;
                    }
                }
                else
                {
                    // Packed input, packed output
                    for (size_t j = 0; j < two; ++j)
                    {
                        __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                        pInputVector += sizeof(XMFLOAT4) * 2;

                        __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                        __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                        __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                        __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                        vTempX = _mm256_mul_ps(vTempX, row0);
                        vTempY = _mm256_mul_ps(vTempY, row1);
                        vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                        vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                        vTempX = _mm256_add_ps(vTempZ, vTempW);

                        _mm256_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
                        pOutputVector += sizeof(XMFLOAT4) * 2;

                        i += 2;
                    }
                }
            }
            else
            {
                // Packed input, unpacked output
                for (size_t j = 0; j < two; ++j)
                {
                    __m256 VV = _mm256_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                    pInputVector += sizeof(XMFLOAT4) * 2;

                    __m256 vTempX = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(0, 0, 0, 0));
                    __m256 vTempY = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(1, 1, 1, 1));
                    __m256 vTempZ = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(2, 2, 2, 2));
                    __m256 vTempW = _mm256_shuffle_ps(VV, VV, _MM_SHUFFLE(3, 3, 3, 3));

                    vTempX = _mm256_mul_ps(vTempX, row0);
                    vTempY = _mm256_mul_ps(vTempY, row1);
                    vTempZ = _mm256_fmadd_ps(vTempZ, row2, vTempX);
                    vTempW = _mm256_fmadd_ps(vTempW, row3, vTempY);
                    vTempX = _mm256_add_ps(vTempZ, vTempW);

                    // The two results go to separate strided slots: low half
                    // first, then the high half
                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_castps256_ps128(vTempX));
                    pOutputVector += OutputStride;

                    _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), _mm256_extractf128_ps(vTempX, 1));
                    pOutputVector += OutputStride;
                    i += 2;
                }
            }
        }
    }

    // Remaining vectors (an odd trailing element, or every vector when the
    // input is not packed), one per iteration with 128-bit operations
    if (i < VectorCount)
    {
        const XMVECTOR row0 = M.r[0];
        const XMVECTOR row1 = M.r[1];
        const XMVECTOR row2 = M.r[2];
        const XMVECTOR row3 = M.r[3];

        for (; i < VectorCount; i++)
        {
            __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
            pInputVector += InputStride;

            XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
            XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
            XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
            XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

            vTempX = _mm_mul_ps(vTempX, row0);
            vTempY = _mm_mul_ps(vTempY, row1);
            vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
            vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
            vTempX = _mm_add_ps(vTempZ, vTempW);

            _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
            pOutputVector += OutputStride;
        }
    }

    // NOTE(review): XM_SFENCE is defined elsewhere; presumably a store fence
    // paired with the streaming stores above - confirm
    XM_SFENCE();

    return pOutputStream;
#elif defined(_XM_SSE_INTRINSICS_)
    auto pInputVector = reinterpret_cast<const uint8_t*>(pInputStream);
    auto pOutputVector = reinterpret_cast<uint8_t*>(pOutputStream);

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    // A stream counts as "aligned" when both its base pointer and its stride
    // are multiples of 16 bytes. The four loops below are identical except for
    // the load (_mm_load_ps vs _mm_loadu_ps) and the store (XM_STREAM_PS vs
    // _mm_storeu_ps) they use.
    if (!(reinterpret_cast<uintptr_t>(pOutputStream) & 0xF) && !(OutputStride & 0xF))
    {
        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
        {
            // Aligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                // (x*row0 + z*row2) + (y*row1 + w*row3)
                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Unaligned input, aligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                XM_STREAM_PS(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    }
    else
    {
        if (!(reinterpret_cast<uintptr_t>(pInputStream) & 0xF) && !(InputStride & 0xF))
        {
            // Aligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_load_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
        else
        {
            // Unaligned input, unaligned output
            for (size_t i = 0; i < VectorCount; i++)
            {
                __m128 V = _mm_loadu_ps(reinterpret_cast<const float*>(pInputVector));
                pInputVector += InputStride;

                XMVECTOR vTempX = XM_PERMUTE_PS(V, _MM_SHUFFLE(0, 0, 0, 0));
                XMVECTOR vTempY = XM_PERMUTE_PS(V, _MM_SHUFFLE(1, 1, 1, 1));
                XMVECTOR vTempZ = XM_PERMUTE_PS(V, _MM_SHUFFLE(2, 2, 2, 2));
                XMVECTOR vTempW = XM_PERMUTE_PS(V, _MM_SHUFFLE(3, 3, 3, 3));

                vTempX = _mm_mul_ps(vTempX, row0);
                vTempY = _mm_mul_ps(vTempY, row1);
                vTempZ = XM_FMADD_PS(vTempZ, row2, vTempX);
                vTempW = XM_FMADD_PS(vTempW, row3, vTempY);
                vTempX = _mm_add_ps(vTempZ, vTempW);

                _mm_storeu_ps(reinterpret_cast<float*>(pOutputVector), vTempX);
                pOutputVector += OutputStride;
            }
        }
    }

    // NOTE(review): XM_SFENCE is defined elsewhere; presumably a store fence
    // paired with the streaming stores above - confirm
    XM_SFENCE();

    return pOutputStream;
#endif
}

/****************************************************************************
 *
 * XMVECTOR operators
 *
 ****************************************************************************/

#ifndef _XM_NO_XMVECTOR_OVERLOADS_

 //------------------------------------------------------------------------------

// Unary plus: returns V unchanged.
inline XMVECTOR XM_CALLCONV operator+ (FXMVECTOR V) noexcept
{
    return V;
}

//------------------------------------------------------------------------------

// Unary minus: returns XMVectorNegate(V).
inline XMVECTOR XM_CALLCONV operator- (FXMVECTOR V) noexcept
{
    const XMVECTOR vNegated = XMVectorNegate(V);
    return vNegated;
}

//------------------------------------------------------------------------------

// Compound add: stores XMVectorAdd(V1, V2) into V1 and returns a reference to V1.
inline XMVECTOR& XM_CALLCONV operator+=
(
    XMVECTOR& V1,
    FXMVECTOR       V2
) noexcept
{
    const XMVECTOR vSum = XMVectorAdd(V1, V2);
    V1 = vSum;
    return V1;
}

//------------------------------------------------------------------------------

// Compound subtract: stores XMVectorSubtract(V1, V2) into V1 and returns a
// reference to V1.
inline XMVECTOR& XM_CALLCONV operator-=
(
    XMVECTOR& V1,
    FXMVECTOR       V2
) noexcept
{
    const XMVECTOR vDifference = XMVectorSubtract(V1, V2);
    V1 = vDifference;
    return V1;
}

//------------------------------------------------------------------------------

// Compound multiply: stores XMVectorMultiply(V1, V2) into V1 and returns a
// reference to V1.
inline XMVECTOR& XM_CALLCONV operator*=
(
    XMVECTOR& V1,
    FXMVECTOR       V2
) noexcept
{
    const XMVECTOR vProduct = XMVectorMultiply(V1, V2);
    V1 = vProduct;
    return V1;
}

//------------------------------------------------------------------------------

// Compound divide: stores XMVectorDivide(V1, V2) into V1 and returns a
// reference to V1.
inline XMVECTOR& XM_CALLCONV operator/=
(
    XMVECTOR& V1,
    FXMVECTOR       V2
) noexcept
{
    const XMVECTOR vQuotient = XMVectorDivide(V1, V2);
    V1 = vQuotient;
    return V1;
}

//------------------------------------------------------------------------------

// Compound scale by a scalar: stores XMVectorScale(V, S) into V and returns a
// reference to V.
inline XMVECTOR& operator*=
(
    XMVECTOR& V,
    const float S
) noexcept
{
    const XMVECTOR vScaled = XMVectorScale(V, S);
    V = vScaled;
    return V;
}

//------------------------------------------------------------------------------

// Compound divide by a scalar: S is replicated across a vector, V is divided
// by it with XMVectorDivide, and a reference to V is returned.
inline XMVECTOR& operator/=
(
    XMVECTOR& V,
    const float S
) noexcept
{
    V = XMVectorDivide(V, XMVectorReplicate(S));
    return V;
}

//------------------------------------------------------------------------------

// Binary add: returns XMVectorAdd(V1, V2).
inline XMVECTOR XM_CALLCONV operator+
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR vSum = XMVectorAdd(V1, V2);
    return vSum;
}

//------------------------------------------------------------------------------

// Binary subtract: returns XMVectorSubtract(V1, V2).
inline XMVECTOR XM_CALLCONV operator-
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR vDifference = XMVectorSubtract(V1, V2);
    return vDifference;
}

//------------------------------------------------------------------------------

// Binary multiply of two vectors: returns XMVectorMultiply(V1, V2).
inline XMVECTOR XM_CALLCONV operator*
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR vProduct = XMVectorMultiply(V1, V2);
    return vProduct;
}

//------------------------------------------------------------------------------

// Binary divide of two vectors: returns XMVectorDivide(V1, V2).
inline XMVECTOR XM_CALLCONV operator/
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    const XMVECTOR vQuotient = XMVectorDivide(V1, V2);
    return vQuotient;
}

//------------------------------------------------------------------------------

// Vector times scalar: returns XMVectorScale(V, S).
inline XMVECTOR XM_CALLCONV operator*
(
    FXMVECTOR      V,
    const float    S
) noexcept
{
    const XMVECTOR vScaled = XMVectorScale(V, S);
    return vScaled;
}

//------------------------------------------------------------------------------

// Vector divided by scalar: S is replicated across a vector and V is divided
// by it with XMVectorDivide.
inline XMVECTOR XM_CALLCONV operator/
(
    FXMVECTOR      V,
    const float    S
) noexcept
{
    return XMVectorDivide(V, XMVectorReplicate(S));
}

//------------------------------------------------------------------------------

// Scalar times vector: returns XMVectorScale(V, S), the same as the
// vector-times-scalar form.
inline XMVECTOR XM_CALLCONV operator*
(
    float           S,
    FXMVECTOR       V
) noexcept
{
    const XMVECTOR vScaled = XMVectorScale(V, S);
    return vScaled;
}

#endif /* !_XM_NO_XMVECTOR_OVERLOADS_ */

#if defined(_XM_NO_INTRINSICS_)
#undef XMISNAN
#undef XMISINF
#endif

#if defined(_XM_SSE_INTRINSICS_)
#undef XM3UNPACK3INTO4
#undef XM3PACK4INTO3
#endif

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(push)
#pragma float_control(precise, on)
#endif

// Returns true when at least one of the sixteen matrix entries is a NaN,
// false otherwise.
inline bool XM_CALLCONV XMMatrixIsNaN(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Inspect the raw bit patterns of all sixteen floats
    auto pBits = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
    for (size_t index = 0; index < 16; ++index)
    {
        // Drop the sign bit; what remains is a NaN if it lies in
        // 0x7F800001 through 0x7FFFFFFF inclusive
        const uint32_t uMagnitude = pBits[index] & 0x7FFFFFFFU;
        // Unsigned wrap-around turns the range test into a single compare
        if ((uMagnitude - 0x7F800001U) < 0x007FFFFFU)
        {
            return true;    // NaN found
        }
    }
    return false;           // Nothing matched
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pull the rows into registers
    float32x4_t vRow0 = M.r[0];
    float32x4_t vRow1 = M.r[1];
    float32x4_t vRow2 = M.r[2];
    float32x4_t vRow3 = M.r[3];
    // A lane is NaN exactly when it does not compare equal to itself
    uint32x4_t vNaN0 = vmvnq_u32(vceqq_f32(vRow0, vRow0));
    uint32x4_t vNaN1 = vmvnq_u32(vceqq_f32(vRow1, vRow1));
    uint32x4_t vNaN2 = vmvnq_u32(vceqq_f32(vRow2, vRow2));
    uint32x4_t vNaN3 = vmvnq_u32(vceqq_f32(vRow3, vRow3));
    // Merge the four row masks into one
    vNaN0 = vorrq_u32(vNaN0, vNaN2);
    vNaN1 = vorrq_u32(vNaN1, vNaN3);
    vNaN0 = vorrq_u32(vNaN0, vNaN1);
    // Fold the merged mask into one 32-bit word; any set bit means a NaN
    uint8x8x2_t vBytes = vzip_u8(
        vget_low_u8(vreinterpretq_u8_u32(vNaN0)),
        vget_high_u8(vreinterpretq_u8_u32(vNaN0)));
    uint16x4x2_t vWords = vzip_u16(vreinterpret_u16_u8(vBytes.val[0]), vreinterpret_u16_u8(vBytes.val[1]));
    uint32_t uAny = vget_lane_u32(vreinterpret_u32_u16(vWords.val[1]), 1);
    return (uAny != 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Pull the rows into registers
    XMVECTOR vRow0 = M.r[0];
    XMVECTOR vRow1 = M.r[1];
    XMVECTOR vRow2 = M.r[2];
    XMVECTOR vRow3 = M.r[3];
    // A lane is NaN exactly when it compares not-equal to itself
    vRow0 = _mm_cmpneq_ps(vRow0, vRow0);
    vRow1 = _mm_cmpneq_ps(vRow1, vRow1);
    vRow2 = _mm_cmpneq_ps(vRow2, vRow2);
    vRow3 = _mm_cmpneq_ps(vRow3, vRow3);
    // Merge the four row masks into one
    vRow0 = _mm_or_ps(vRow0, vRow2);
    vRow1 = _mm_or_ps(vRow1, vRow3);
    vRow0 = _mm_or_ps(vRow0, vRow1);
    // Any sign bit set in the merged mask means a NaN was present
    return (_mm_movemask_ps(vRow0) != 0);
#else
#endif
}

#if !defined(_XM_NO_INTRINSICS_) && defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma float_control(pop)
#endif

//------------------------------------------------------------------------------

// Returns true when at least one of the 16 entries of M is +INF or -INF,
// false otherwise.
inline bool XM_CALLCONV XMMatrixIsInfinite(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Examine the raw 32-bit pattern of every entry
    auto pBits = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
    for (size_t uIndex = 0; uIndex < 16; ++uIndex)
    {
        // Discard the sign bit so both infinities share one pattern
        const uint32_t uMagnitude = pBits[uIndex] & 0x7FFFFFFFU;
        // INF is 0x7F800000
        if (uMagnitude == 0x7F800000U)
        {
            return true;    // INF found
        }
    }
    return false;           // No entry matched
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Clear the sign bit of every lane
    const float32x4_t vAbsRow0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(M.r[0]), g_XMAbsMask));
    const float32x4_t vAbsRow1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(M.r[1]), g_XMAbsMask));
    const float32x4_t vAbsRow2 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(M.r[2]), g_XMAbsMask));
    const float32x4_t vAbsRow3 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(M.r[3]), g_XMAbsMask));
    // Flag the lanes that equal infinity
    const uint32x4_t infRow0 = vceqq_f32(vAbsRow0, g_XMInfinity);
    const uint32x4_t infRow1 = vceqq_f32(vAbsRow1, g_XMInfinity);
    const uint32x4_t infRow2 = vceqq_f32(vAbsRow2, g_XMInfinity);
    const uint32x4_t infRow3 = vceqq_f32(vAbsRow3, g_XMInfinity);
    // Merge the four row masks into one
    const uint32x4_t infRows02 = vorrq_u32(infRow0, infRow2);
    const uint32x4_t infRows13 = vorrq_u32(infRow1, infRow3);
    const uint32x4_t infAny = vorrq_u32(infRows02, infRows13);
    // The two zips gather the top byte of each of the four lanes into a
    // single 32-bit value; any set bit means some lane was flagged
    const uint8x8x2_t vZip8 = vzip_u8(
        vget_low_u8(vreinterpretq_u8_u32(infAny)),
        vget_high_u8(vreinterpretq_u8_u32(infAny)));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return (uPacked != 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Clear the sign bit of every lane
    const XMVECTOR vAbsRow0 = _mm_and_ps(M.r[0], g_XMAbsMask);
    const XMVECTOR vAbsRow1 = _mm_and_ps(M.r[1], g_XMAbsMask);
    const XMVECTOR vAbsRow2 = _mm_and_ps(M.r[2], g_XMAbsMask);
    const XMVECTOR vAbsRow3 = _mm_and_ps(M.r[3], g_XMAbsMask);
    // Flag the lanes that equal infinity
    const XMVECTOR vInfRow0 = _mm_cmpeq_ps(vAbsRow0, g_XMInfinity);
    const XMVECTOR vInfRow1 = _mm_cmpeq_ps(vAbsRow1, g_XMInfinity);
    const XMVECTOR vInfRow2 = _mm_cmpeq_ps(vAbsRow2, g_XMInfinity);
    const XMVECTOR vInfRow3 = _mm_cmpeq_ps(vAbsRow3, g_XMInfinity);
    // Merge the four row masks into one
    const XMVECTOR vInfRows01 = _mm_or_ps(vInfRow0, vInfRow1);
    const XMVECTOR vInfRows23 = _mm_or_ps(vInfRow2, vInfRow3);
    const XMVECTOR vInfAny = _mm_or_ps(vInfRows01, vInfRows23);
    // A flagged lane has its sign bit set, so a non-zero mask means INF
    return (_mm_movemask_ps(vInfAny) != 0);
#endif
}

//------------------------------------------------------------------------------

// Returns true when M equals the identity matrix: every diagonal entry is
// 1.0f and every off-diagonal entry is zero.
inline bool XM_CALLCONV XMMatrixIsIdentity(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Work on the raw bit patterns so a single compare decides the result
    auto pBits = reinterpret_cast<const uint32_t*>(&M.m[0][0]);
    uint32_t uDiagonalBits = 0;
    uint32_t uOffDiagonalBits = 0;
    for (size_t uIndex = 0; uIndex < 16; ++uIndex)
    {
        if ((uIndex % 5) == 0)
        {
            // Diagonal entries sit at flat indices 0, 5, 10 and 15;
            // XOR with the bits of 1.0f leaves zero only for an exact 1.0f
            uDiagonalBits |= pBits[uIndex] ^ 0x3F800000U;
        }
        else
        {
            // Collect every bit found in the entries expected to be 0.0f
            uOffDiagonalBits |= pBits[uIndex];
        }
    }
    // Ignore the sign bit of the off-diagonal entries (allow -0.0f)
    uOffDiagonalBits &= 0x7FFFFFFFU;
    // Identity only if no stray bit survived in either accumulator
    return ((uDiagonalBits | uOffDiagonalBits) == 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Compare each row against the matching identity row
    const uint32x4_t eqRow0 = vceqq_f32(M.r[0], g_XMIdentityR0);
    const uint32x4_t eqRow1 = vceqq_f32(M.r[1], g_XMIdentityR1);
    const uint32x4_t eqRow2 = vceqq_f32(M.r[2], g_XMIdentityR2);
    const uint32x4_t eqRow3 = vceqq_f32(M.r[3], g_XMIdentityR3);
    // A lane stays set only when it matched in all four rows
    const uint32x4_t eqRows02 = vandq_u32(eqRow0, eqRow2);
    const uint32x4_t eqRows13 = vandq_u32(eqRow1, eqRow3);
    const uint32x4_t eqAll = vandq_u32(eqRows02, eqRows13);
    // The two zips gather the top byte of each of the four lanes into a
    // single 32-bit value; all bits set means every lane matched
    const uint8x8x2_t vZip8 = vzip_u8(vget_low_u8(vreinterpretq_u8_u32(eqAll)), vget_high_u8(vreinterpretq_u8_u32(eqAll)));
    const uint16x4x2_t vZip16 = vzip_u16(vreinterpret_u16_u8(vZip8.val[0]), vreinterpret_u16_u8(vZip8.val[1]));
    const uint32_t uPacked = vget_lane_u32(vreinterpret_u32_u16(vZip16.val[1]), 1);
    return (uPacked == 0xFFFFFFFFU);
#elif defined(_XM_SSE_INTRINSICS_)
    // Compare each row against the matching identity row
    const XMVECTOR vEqRow0 = _mm_cmpeq_ps(M.r[0], g_XMIdentityR0);
    const XMVECTOR vEqRow1 = _mm_cmpeq_ps(M.r[1], g_XMIdentityR1);
    const XMVECTOR vEqRow2 = _mm_cmpeq_ps(M.r[2], g_XMIdentityR2);
    const XMVECTOR vEqRow3 = _mm_cmpeq_ps(M.r[3], g_XMIdentityR3);
    // A lane stays set only when it matched in all four rows
    const XMVECTOR vEqRows01 = _mm_and_ps(vEqRow0, vEqRow1);
    const XMVECTOR vEqRows23 = _mm_and_ps(vEqRow2, vEqRow3);
    const XMVECTOR vEqAll = _mm_and_ps(vEqRows01, vEqRows23);
    // All four sign bits set means every lane matched
    return (_mm_movemask_ps(vEqAll) == 0x0f);
#endif
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Perform a 4x4 matrix multiply by a 4x4 matrix.
//
// Returns the product M1 * M2: element [i][j] of the result is the sum over k
// of M1[i][k] * M2[k][j]. Put differently, row i of the result is the sum of
// the four rows of M2, each scaled by the matching component of row i of M1.
//
// Exactly one implementation is compiled in, chosen by the _XM_* macros:
// plain scalar code, ARM NEON, AVX2 (uses fused multiply-add), or SSE (which
// splats components with _mm_broadcast_ss when _XM_AVX_INTRINSICS_ is defined
// and with XM_PERMUTE_PS otherwise).
inline XMMATRIX XM_CALLCONV XMMatrixMultiply
(
    FXMMATRIX M1,
    CXMMATRIX M2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMMATRIX mResult;
    // Cache the invariants in registers
    // (x, y, z, w are the four components of the current row of M1)
    float x = M1.m[0][0];
    float y = M1.m[0][1];
    float z = M1.m[0][2];
    float w = M1.m[0][3];
    // Perform the operation on the first row
    // Each element is the row of M1 dotted with one column of M2
    mResult.m[0][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
    mResult.m[0][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w);
    mResult.m[0][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w);
    mResult.m[0][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w);
    // Repeat for all the other rows
    x = M1.m[1][0];
    y = M1.m[1][1];
    z = M1.m[1][2];
    w = M1.m[1][3];
    mResult.m[1][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
    mResult.m[1][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w);
    mResult.m[1][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w);
    mResult.m[1][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w);
    x = M1.m[2][0];
    y = M1.m[2][1];
    z = M1.m[2][2];
    w = M1.m[2][3];
    mResult.m[2][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
    mResult.m[2][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w);
    mResult.m[2][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w);
    mResult.m[2][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w);
    x = M1.m[3][0];
    y = M1.m[3][1];
    z = M1.m[3][2];
    w = M1.m[3][3];
    mResult.m[3][0] = (M2.m[0][0] * x) + (M2.m[1][0] * y) + (M2.m[2][0] * z) + (M2.m[3][0] * w);
    mResult.m[3][1] = (M2.m[0][1] * x) + (M2.m[1][1] * y) + (M2.m[2][1] * z) + (M2.m[3][1] * w);
    mResult.m[3][2] = (M2.m[0][2] * x) + (M2.m[1][2] * y) + (M2.m[2][2] * z) + (M2.m[3][2] * w);
    mResult.m[3][3] = (M2.m[0][3] * x) + (M2.m[1][3] * y) + (M2.m[2][3] * z) + (M2.m[3][3] * w);
    return mResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMMATRIX mResult;
    // VL holds components 0,1 and VH components 2,3 of the current M1 row
    float32x2_t VL = vget_low_f32(M1.r[0]);
    float32x2_t VH = vget_high_f32(M1.r[0]);
    // Perform the operation on the first row
    // vZ = row0(M2)*c0 + row2(M2)*c2 and vW = row1(M2)*c1 + row3(M2)*c3,
    // where c0..c3 are the components of the M1 row; the two halves are
    // then added together
    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    mResult.r[0] = vaddq_f32(vZ, vW);
    // Repeat for the other 3 rows
    VL = vget_low_f32(M1.r[1]);
    VH = vget_high_f32(M1.r[1]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    mResult.r[1] = vaddq_f32(vZ, vW);
    VL = vget_low_f32(M1.r[2]);
    VH = vget_high_f32(M1.r[2]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    mResult.r[2] = vaddq_f32(vZ, vW);
    VL = vget_low_f32(M1.r[3]);
    VH = vget_high_f32(M1.r[3]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    mResult.r[3] = vaddq_f32(vZ, vW);
    return mResult;
#elif defined(_XM_AVX2_INTRINSICS_)
    // Pack two rows per 256-bit register:
    // t0 = [M1 row 0 | M1 row 1], t1 = [M1 row 2 | M1 row 3]
    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);

    // u0 = [M2 row 0 | M2 row 1], u1 = [M2 row 2 | M2 row 3]
    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);

    // Component 0 of every M1 row (splatted within each 128-bit half)
    // times M2 row 0 (0x00 copies the low half of u0 into both halves)
    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
    __m256 c0 = _mm256_mul_ps(a0, b0);
    __m256 c1 = _mm256_mul_ps(a1, b0);

    // Add component 1 times M2 row 1 (0x11 copies the high half of u0);
    // _mm256_fmadd_ps(a, b, c) evaluates a * b + c
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);

    // Second partial sum starts with component 2 times M2 row 2
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
    __m256 c4 = _mm256_mul_ps(a0, b1);
    __m256 c5 = _mm256_mul_ps(a1, b1);

    // ...and adds component 3 times M2 row 3
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);

    // Combine the two partial sums: t0 = result rows 0/1, t1 = rows 2/3
    t0 = _mm256_add_ps(c2, c6);
    t1 = _mm256_add_ps(c3, c7);

    // Split the 256-bit registers back into four 128-bit rows
    XMMATRIX mResult;
    mResult.r[0] = _mm256_castps256_ps128(t0);
    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
    mResult.r[2] = _mm256_castps256_ps128(t1);
    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
    return mResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX mResult;
    // Splat the component X,Y,Z then W
#if defined(_XM_AVX_INTRINSICS_)
    // Broadcast each component of M1 row 0 straight from memory
    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
#else
    // Use vW to hold the original row
    XMVECTOR vW = M1.r[0];
    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    // Perform the operation on the first row
    // (scale each row of M2 by the matching splatted component)
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    // Perform a binary add to reduce cumulative errors
    // i.e. (X + Z) + (Y + W) rather than a left-to-right chain
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    mResult.r[0] = vX;
    // Repeat for the other 3 rows
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
#else
    vW = M1.r[1];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    mResult.r[1] = vX;
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
#else
    vW = M1.r[2];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    mResult.r[2] = vX;
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
#else
    vW = M1.r[3];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    mResult.r[3] = vX;
    return mResult;
#endif
}

//------------------------------------------------------------------------------

// Multiply two 4x4 matrices and return the transpose of the product.
//
// Element [i][j] of the result is the sum over k of M1[j][k] * M2[k][i],
// i.e. element [j][i] of M1 * M2.
//
// Exactly one implementation is compiled in, chosen by the _XM_* macros.
// The scalar path writes the transposed elements directly; the NEON, AVX2
// and SSE paths first build the four rows of M1 * M2 and then transpose them
// with lane shuffles.
inline XMMATRIX XM_CALLCONV XMMatrixMultiplyTranspose
(
    FXMMATRIX M1,
    CXMMATRIX M2
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMMATRIX mResult;
    // Cache the invariants in registers
    // (x, y, z, w are the four entries of the current column of M2)
    float x = M2.m[0][0];
    float y = M2.m[1][0];
    float z = M2.m[2][0];
    float w = M2.m[3][0];
    // Perform the operation on the first row
    // Result row 0 holds every row of M1 dotted with column 0 of M2
    mResult.m[0][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w);
    mResult.m[0][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w);
    mResult.m[0][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w);
    mResult.m[0][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w);
    // Repeat for all the other rows
    x = M2.m[0][1];
    y = M2.m[1][1];
    z = M2.m[2][1];
    w = M2.m[3][1];
    mResult.m[1][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w);
    mResult.m[1][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w);
    mResult.m[1][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w);
    mResult.m[1][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w);
    x = M2.m[0][2];
    y = M2.m[1][2];
    z = M2.m[2][2];
    w = M2.m[3][2];
    mResult.m[2][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w);
    mResult.m[2][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w);
    mResult.m[2][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w);
    mResult.m[2][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w);
    x = M2.m[0][3];
    y = M2.m[1][3];
    z = M2.m[2][3];
    w = M2.m[3][3];
    mResult.m[3][0] = (M1.m[0][0] * x) + (M1.m[0][1] * y) + (M1.m[0][2] * z) + (M1.m[0][3] * w);
    mResult.m[3][1] = (M1.m[1][0] * x) + (M1.m[1][1] * y) + (M1.m[1][2] * z) + (M1.m[1][3] * w);
    mResult.m[3][2] = (M1.m[2][0] * x) + (M1.m[2][1] * y) + (M1.m[2][2] * z) + (M1.m[2][3] * w);
    mResult.m[3][3] = (M1.m[3][0] * x) + (M1.m[3][1] * y) + (M1.m[3][2] * z) + (M1.m[3][3] * w);
    return mResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // VL holds components 0,1 and VH components 2,3 of the current M1 row
    float32x2_t VL = vget_low_f32(M1.r[0]);
    float32x2_t VH = vget_high_f32(M1.r[0]);
    // Perform the operation on the first row
    // r0..r3 below are the (untransposed) rows of M1 * M2
    float32x4_t vX = vmulq_lane_f32(M2.r[0], VL, 0);
    float32x4_t vY = vmulq_lane_f32(M2.r[1], VL, 1);
    float32x4_t vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    float32x4_t vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    float32x4_t r0 = vaddq_f32(vZ, vW);
    // Repeat for the other 3 rows
    VL = vget_low_f32(M1.r[1]);
    VH = vget_high_f32(M1.r[1]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    float32x4_t r1 = vaddq_f32(vZ, vW);
    VL = vget_low_f32(M1.r[2]);
    VH = vget_high_f32(M1.r[2]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    float32x4_t r2 = vaddq_f32(vZ, vW);
    VL = vget_low_f32(M1.r[3]);
    VH = vget_high_f32(M1.r[3]);
    vX = vmulq_lane_f32(M2.r[0], VL, 0);
    vY = vmulq_lane_f32(M2.r[1], VL, 1);
    vZ = vmlaq_lane_f32(vX, M2.r[2], VH, 0);
    vW = vmlaq_lane_f32(vY, M2.r[3], VH, 1);
    float32x4_t r3 = vaddq_f32(vZ, vW);

    // Transpose result
    // First zip pairs rows 0/2 and rows 1/3 lane by lane
    float32x4x2_t P0 = vzipq_f32(r0, r2);
    float32x4x2_t P1 = vzipq_f32(r1, r3);

    // Second zip interleaves those pairs, giving one product column per
    // vector: T0 = columns 0 and 1, T1 = columns 2 and 3
    float32x4x2_t T0 = vzipq_f32(P0.val[0], P1.val[0]);
    float32x4x2_t T1 = vzipq_f32(P0.val[1], P1.val[1]);

    XMMATRIX mResult;
    mResult.r[0] = T0.val[0];
    mResult.r[1] = T0.val[1];
    mResult.r[2] = T1.val[0];
    mResult.r[3] = T1.val[1];
    return mResult;
#elif defined(_XM_AVX2_INTRINSICS_)
    // Pack two rows per 256-bit register:
    // t0 = [M1 row 0 | M1 row 1], t1 = [M1 row 2 | M1 row 3]
    __m256 t0 = _mm256_castps128_ps256(M1.r[0]);
    t0 = _mm256_insertf128_ps(t0, M1.r[1], 1);
    __m256 t1 = _mm256_castps128_ps256(M1.r[2]);
    t1 = _mm256_insertf128_ps(t1, M1.r[3], 1);

    // u0 = [M2 row 0 | M2 row 1], u1 = [M2 row 2 | M2 row 3]
    __m256 u0 = _mm256_castps128_ps256(M2.r[0]);
    u0 = _mm256_insertf128_ps(u0, M2.r[1], 1);
    __m256 u1 = _mm256_castps128_ps256(M2.r[2]);
    u1 = _mm256_insertf128_ps(u1, M2.r[3], 1);

    // Component 0 of every M1 row (splatted within each 128-bit half)
    // times M2 row 0 (0x00 copies the low half of u0 into both halves)
    __m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
    __m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
    __m256 c0 = _mm256_mul_ps(a0, b0);
    __m256 c1 = _mm256_mul_ps(a1, b0);

    // Add component 1 times M2 row 1 (0x11 copies the high half of u0);
    // _mm256_fmadd_ps(a, b, c) evaluates a * b + c
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
    b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
    __m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
    __m256 c3 = _mm256_fmadd_ps(a1, b0, c1);

    // Second partial sum starts with component 2 times M2 row 2
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
    __m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
    __m256 c4 = _mm256_mul_ps(a0, b1);
    __m256 c5 = _mm256_mul_ps(a1, b1);

    // ...and adds component 3 times M2 row 3
    a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
    a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
    b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
    __m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
    __m256 c7 = _mm256_fmadd_ps(a1, b1, c5);

    // Combine the partial sums: t0 = product rows 0/1, t1 = product rows 2/3
    t0 = _mm256_add_ps(c2, c6);
    t1 = _mm256_add_ps(c3, c7);

    // Transpose result
    // Two rounds of unpack + 128-bit permute; afterwards
    // t0 = [product column 0 | column 1], t1 = [column 2 | column 3]
    __m256 vTemp = _mm256_unpacklo_ps(t0, t1);
    __m256 vTemp2 = _mm256_unpackhi_ps(t0, t1);
    __m256 vTemp3 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    __m256 vTemp4 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);
    vTemp = _mm256_unpacklo_ps(vTemp3, vTemp4);
    vTemp2 = _mm256_unpackhi_ps(vTemp3, vTemp4);
    t0 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x20);
    t1 = _mm256_permute2f128_ps(vTemp, vTemp2, 0x31);

    // Split the 256-bit registers back into four 128-bit rows
    XMMATRIX mResult;
    mResult.r[0] = _mm256_castps256_ps128(t0);
    mResult.r[1] = _mm256_extractf128_ps(t0, 1);
    mResult.r[2] = _mm256_castps256_ps128(t1);
    mResult.r[3] = _mm256_extractf128_ps(t1, 1);
    return mResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the component X,Y,Z then W
#if defined(_XM_AVX_INTRINSICS_)
    // Broadcast each component of M1 row 0 straight from memory
    XMVECTOR vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 0);
    XMVECTOR vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 1);
    XMVECTOR vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 2);
    XMVECTOR vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[0]) + 3);
#else
    // Use vW to hold the original row
    XMVECTOR vW = M1.r[0];
    XMVECTOR vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    XMVECTOR vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    XMVECTOR vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    // Perform the operation on the first row
    // (scale each row of M2 by the matching splatted component)
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    // Perform a binary add to reduce cumulative errors
    // i.e. (X + Z) + (Y + W) rather than a left-to-right chain
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    // r0..r3 are the (untransposed) rows of M1 * M2
    XMVECTOR r0 = vX;
    // Repeat for the other 3 rows
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[1]) + 3);
#else
    vW = M1.r[1];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    XMVECTOR r1 = vX;
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[2]) + 3);
#else
    vW = M1.r[2];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    XMVECTOR r2 = vX;
#if defined(_XM_AVX_INTRINSICS_)
    vX = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 0);
    vY = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 1);
    vZ = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 2);
    vW = _mm_broadcast_ss(reinterpret_cast<const float*>(&M1.r[3]) + 3);
#else
    vW = M1.r[3];
    vX = XM_PERMUTE_PS(vW, _MM_SHUFFLE(0, 0, 0, 0));
    vY = XM_PERMUTE_PS(vW, _MM_SHUFFLE(1, 1, 1, 1));
    vZ = XM_PERMUTE_PS(vW, _MM_SHUFFLE(2, 2, 2, 2));
    vW = XM_PERMUTE_PS(vW, _MM_SHUFFLE(3, 3, 3, 3));
#endif
    vX = _mm_mul_ps(vX, M2.r[0]);
    vY = _mm_mul_ps(vY, M2.r[1]);
    vZ = _mm_mul_ps(vZ, M2.r[2]);
    vW = _mm_mul_ps(vW, M2.r[3]);
    vX = _mm_add_ps(vX, vZ);
    vY = _mm_add_ps(vY, vW);
    vX = _mm_add_ps(vX, vY);
    XMVECTOR r3 = vX;

    // Transpose result
    // (in the lane comments below x, y, z, w name product rows r0..r3 and
    // the suffix names the component within that row)
    // x.x,x.y,y.x,y.y
    XMVECTOR vTemp1 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
    // x.z,x.w,y.z,y.w
    XMVECTOR vTemp3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 2, 3, 2));
    // z.x,z.y,w.x,w.y
    XMVECTOR vTemp2 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(1, 0, 1, 0));
    // z.z,z.w,w.z,w.w
    XMVECTOR vTemp4 = _mm_shuffle_ps(r2, r3, _MM_SHUFFLE(3, 2, 3, 2));

    XMMATRIX mResult;
    // x.x,y.x,z.x,w.x
    mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
    // x.y,y.y,z.y,w.y
    mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
    // x.z,y.z,z.z,w.z
    mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
    // x.w,y.w,z.w,w.w
    mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));
    return mResult;
#endif
}

//------------------------------------------------------------------------------

// Returns the transpose of M: row i of the result is column i of M.
// Every code path is a pure lane rearrangement; no arithmetic is performed.
inline XMMATRIX XM_CALLCONV XMMatrixTranspose(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    // Input layout (mRC = row R, column C):
    //
    //     m00m01m02m03
    //     m10m11m12m13
    //     m20m21m22m23
    //     m30m31m32m33

    // Pass 1: interleave rows 0/2 and rows 1/3
    const XMVECTOR vXY02 = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21
    const XMVECTOR vXY13 = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31
    const XMVECTOR vZW02 = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23
    const XMVECTOR vZW13 = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33

    // Pass 2: interleave the pass-1 vectors to finish each column
    XMMATRIX mOut;
    mOut.r[0] = XMVectorMergeXY(vXY02, vXY13); // m00m10m20m30
    mOut.r[1] = XMVectorMergeZW(vXY02, vXY13); // m01m11m21m31
    mOut.r[2] = XMVectorMergeXY(vZW02, vZW13); // m02m12m22m32
    mOut.r[3] = XMVectorMergeZW(vZW02, vZW13); // m03m13m23m33
    return mOut;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pass 1: zip rows 0/2 and rows 1/3 lane by lane
    const float32x4x2_t vZip02 = vzipq_f32(M.r[0], M.r[2]);
    const float32x4x2_t vZip13 = vzipq_f32(M.r[1], M.r[3]);

    // Pass 2: zip the pass-1 halves; each output vector is one column of M
    const float32x4x2_t vCols01 = vzipq_f32(vZip02.val[0], vZip13.val[0]);
    const float32x4x2_t vCols23 = vzipq_f32(vZip02.val[1], vZip13.val[1]);

    XMMATRIX mOut;
    mOut.r[0] = vCols01.val[0];
    mOut.r[1] = vCols01.val[1];
    mOut.r[2] = vCols23.val[0];
    mOut.r[3] = vCols23.val[1];
    return mOut;
#elif defined(_XM_AVX2_INTRINSICS_)
    // Pack two rows per 256-bit register: [row 0 | row 1] and [row 2 | row 3]
    const __m256 vRows01 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.r[0]), M.r[1], 1);
    const __m256 vRows23 = _mm256_insertf128_ps(_mm256_castps128_ps256(M.r[2]), M.r[3], 1);

    // Round 1: interleave within each 128-bit half, then regroup the halves
    const __m256 vLoA = _mm256_unpacklo_ps(vRows01, vRows23);
    const __m256 vHiA = _mm256_unpackhi_ps(vRows01, vRows23);
    const __m256 vMixLo = _mm256_permute2f128_ps(vLoA, vHiA, 0x20);
    const __m256 vMixHi = _mm256_permute2f128_ps(vLoA, vHiA, 0x31);
    // Round 2: same pattern again; the outputs are
    // [column 0 | column 1] and [column 2 | column 3]
    const __m256 vLoB = _mm256_unpacklo_ps(vMixLo, vMixHi);
    const __m256 vHiB = _mm256_unpackhi_ps(vMixLo, vMixHi);
    const __m256 vCols01 = _mm256_permute2f128_ps(vLoB, vHiB, 0x20);
    const __m256 vCols23 = _mm256_permute2f128_ps(vLoB, vHiB, 0x31);

    // Split back into four 128-bit rows
    XMMATRIX mOut;
    mOut.r[0] = _mm256_castps256_ps128(vCols01);
    mOut.r[1] = _mm256_extractf128_ps(vCols01, 1);
    mOut.r[2] = _mm256_castps256_ps128(vCols23);
    mOut.r[3] = _mm256_extractf128_ps(vCols23, 1);
    return mOut;
#elif defined(_XM_SSE_INTRINSICS_)
    // In the lane comments x, y, z, w name rows 0..3 of M and the suffix
    // names the component within that row.
    // x.x,x.y,y.x,y.y
    const XMVECTOR vXY01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
    // x.z,x.w,y.z,y.w
    const XMVECTOR vZW01 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
    // z.x,z.y,w.x,w.y
    const XMVECTOR vXY23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
    // z.z,z.w,w.z,w.w
    const XMVECTOR vZW23 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));

    XMMATRIX mOut;
    // x.x,y.x,z.x,w.x
    mOut.r[0] = _mm_shuffle_ps(vXY01, vXY23, _MM_SHUFFLE(2, 0, 2, 0));
    // x.y,y.y,z.y,w.y
    mOut.r[1] = _mm_shuffle_ps(vXY01, vXY23, _MM_SHUFFLE(3, 1, 3, 1));
    // x.z,y.z,z.z,w.z
    mOut.r[2] = _mm_shuffle_ps(vZW01, vZW23, _MM_SHUFFLE(2, 0, 2, 0));
    // x.w,y.w,z.w,w.w
    mOut.r[3] = _mm_shuffle_ps(vZW01, vZW23, _MM_SHUFFLE(3, 1, 3, 1));
    return mOut;
#endif
}

//------------------------------------------------------------------------------
// Return the inverse and the determinant of a 4x4 matrix

// Computes the inverse of the 4x4 matrix M and, optionally, its determinant.
//
// pDeterminant: optional output. When non-null it receives the XMVector4Dot
//               result that is used as the determinant; pass nullptr to skip.
// M:            the matrix to invert.
//
// Returns the intermediate rows (C0/C2/C4/C6) multiplied by the reciprocal of
// the determinant.
//
// NOTE: the determinant is never compared against zero before the reciprocal
// (scalar/NEON path) or the division (SSE path) below, so a caller that may
// pass a singular matrix should inspect *pDeterminant itself.
inline XMMATRIX XM_CALLCONV XMMatrixInverse
(
    XMVECTOR* pDeterminant,
    FXMMATRIX  M
) noexcept
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // All of the work below is done on the transpose of M.
    XMMATRIX MT = XMMatrixTranspose(M);

    // First products for the three difference vectors D0, D1, D2:
    // D0 pairs rows 2 and 3 of MT, D1 pairs rows 0 and 1, D2 mixes both pairs.
    XMVECTOR V0[4], V1[4];
    V0[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[2]);
    V1[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[3]);
    V0[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[0]);
    V1[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[1]);
    V0[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[2], MT.r[0]);
    V1[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[3], MT.r[1]);

    XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]);
    XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]);
    XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]);

    // Second products, using the complementary swizzles of the same rows.
    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[2]);
    V1[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[3]);
    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[0]);
    V1[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[1]);
    V0[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[2], MT.r[0]);
    V1[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[3], MT.r[1]);

    // Presumably Dn = Dn - V0*V1 (going by the helper's name), which turns each
    // lane of D0..D2 into a difference of two products.
    D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0);
    D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1);
    D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2);

    // First of three row-element * D-lane product groups feeding C0/C2/C4/C6.
    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[1]);
    V1[0] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D0, D2);
    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[0]);
    V1[1] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D0, D2);
    V0[2] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[3]);
    V1[2] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D1, D2);
    V0[3] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[2]);
    V1[3] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D1, D2);

    XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]);
    XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]);
    XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]);
    XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]);

    // Second product group, folded into C0/C2/C4/C6 below.
    V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[1]);
    V1[0] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(D0, D2);
    V0[1] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[0]);
    V1[1] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0X>(D0, D2);
    V0[2] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[3]);
    V1[2] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z>(D1, D2);
    V0[3] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[2]);
    V1[3] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(D1, D2);

    C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
    C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);

    // Third product group. It is both added and subtracted below so that each
    // output lane can later pick the variant it needs.
    V0[0] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[1]);
    V1[0] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z>(D0, D2);
    V0[1] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[0]);
    V1[1] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X>(D0, D2);
    V0[2] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[3]);
    V1[2] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z>(D1, D2);
    V0[3] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[2]);
    V1[3] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z>(D1, D2);

    // Odd-numbered C vectors are computed from the OLD even-numbered value, so
    // each pair must be evaluated in exactly this order.
    XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
    C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0);
    XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2);
    C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
    XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
    C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4);
    XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6);
    C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);

    // Merge each even/odd pair lane-wise using the 0101 select mask
    // (presumably lanes x,z from the first argument and y,w from the second).
    XMMATRIX R;
    R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v);
    R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v);
    R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v);
    R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v);

    // Determinant = 4-component dot of the first result row with row 0 of MT.
    XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]);

    if (pDeterminant != nullptr)
        *pDeterminant = Determinant;

    // No zero check: a zero determinant goes straight into the reciprocal.
    XMVECTOR Reciprocal = XMVectorReciprocal(Determinant);

    XMMATRIX Result;
    Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal);
    Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal);
    Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal);
    Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal);
    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Transpose matrix
    // (done inline with eight shuffles: first gather 2x2 quadrants, then
    // interleave them into the rows of MT)
    XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(1, 0, 1, 0));
    XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0], M.r[1], _MM_SHUFFLE(3, 2, 3, 2));
    XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(1, 0, 1, 0));
    XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2], M.r[3], _MM_SHUFFLE(3, 2, 3, 2));

    XMMATRIX MT;
    MT.r[0] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0));
    MT.r[1] = _mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1));
    MT.r[2] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0));
    MT.r[3] = _mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1));

    // First products for D0 (rows 2,3), D1 (rows 0,1) and D2 (mixed).
    XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 1, 0, 0));
    XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(3, 2, 3, 2));
    XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 1, 0, 0));
    XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(3, 2, 3, 2));
    XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(2, 0, 2, 0));
    XMVECTOR V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(3, 1, 3, 1));

    XMVECTOR D0 = _mm_mul_ps(V00, V10);
    XMVECTOR D1 = _mm_mul_ps(V01, V11);
    XMVECTOR D2 = _mm_mul_ps(V02, V12);

    // Second products with the complementary lane selections.
    V00 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(3, 2, 3, 2));
    V10 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 1, 0, 0));
    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(3, 2, 3, 2));
    V11 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 1, 0, 0));
    V02 = _mm_shuffle_ps(MT.r[2], MT.r[0], _MM_SHUFFLE(3, 1, 3, 1));
    V12 = _mm_shuffle_ps(MT.r[3], MT.r[1], _MM_SHUFFLE(2, 0, 2, 0));

    // Presumably Dn = Dn - V0n*V1n (XM_FNMADD_PS is defined elsewhere).
    D0 = XM_FNMADD_PS(V00, V10, D0);
    D1 = XM_FNMADD_PS(V01, V11, D1);
    D2 = XM_FNMADD_PS(V02, V12, D2);
    // V11 = D0Y,D0W,D2Y,D2Y
    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 1, 3, 1));
    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1, 0, 2, 1));
    V10 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(0, 3, 0, 2));
    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0, 1, 0, 2));
    V11 = _mm_shuffle_ps(V11, D0, _MM_SHUFFLE(2, 1, 2, 1));
    // V13 = D1Y,D1W,D2W,D2W
    XMVECTOR V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 3, 3, 1));
    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1, 0, 2, 1));
    V12 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(0, 3, 0, 2));
    XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(0, 1, 0, 2));
    V13 = _mm_shuffle_ps(V13, D1, _MM_SHUFFLE(2, 1, 2, 1));

    // First product group of the four result rows.
    XMVECTOR C0 = _mm_mul_ps(V00, V10);
    XMVECTOR C2 = _mm_mul_ps(V01, V11);
    XMVECTOR C4 = _mm_mul_ps(V02, V12);
    XMVECTOR C6 = _mm_mul_ps(V03, V13);

    // V11 = D0X,D0Y,D2X,D2X
    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(0, 0, 1, 0));
    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2, 1, 3, 2));
    V10 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(2, 1, 0, 3));
    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1, 3, 2, 3));
    V11 = _mm_shuffle_ps(D0, V11, _MM_SHUFFLE(0, 2, 1, 2));
    // V13 = D1X,D1Y,D2Z,D2Z
    V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(2, 2, 1, 0));
    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2, 1, 3, 2));
    V12 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(2, 1, 0, 3));
    V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(1, 3, 2, 3));
    V13 = _mm_shuffle_ps(D1, V13, _MM_SHUFFLE(0, 2, 1, 2));

    // Second product group, folded into C0/C2/C4/C6.
    C0 = XM_FNMADD_PS(V00, V10, C0);
    C2 = XM_FNMADD_PS(V01, V11, C2);
    C4 = XM_FNMADD_PS(V02, V12, C4);
    C6 = XM_FNMADD_PS(V03, V13, C6);

    V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(0, 3, 0, 3));
    // V10 = D0Z,D0Z,D2X,D2Y
    V10 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 2, 2));
    V10 = XM_PERMUTE_PS(V10, _MM_SHUFFLE(0, 2, 3, 0));
    V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(2, 0, 3, 1));
    // V11 = D0X,D0W,D2X,D2Y
    V11 = _mm_shuffle_ps(D0, D2, _MM_SHUFFLE(1, 0, 3, 0));
    V11 = XM_PERMUTE_PS(V11, _MM_SHUFFLE(2, 1, 0, 3));
    V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(0, 3, 0, 3));
    // V12 = D1Z,D1Z,D2Z,D2W
    V12 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 2, 2));
    V12 = XM_PERMUTE_PS(V12, _MM_SHUFFLE(0, 2, 3, 0));
    V03 = XM_PERMUTE_PS(MT.r[2], _MM_SHUFFLE(2, 0, 3, 1));
    // V13 = D1X,D1W,D2Z,D2W
    V13 = _mm_shuffle_ps(D1, D2, _MM_SHUFFLE(3, 2, 3, 0));
    V13 = XM_PERMUTE_PS(V13, _MM_SHUFFLE(2, 1, 0, 3));

    // Third product group: computed once, then both added to and subtracted
    // from each C vector. The odd-numbered vector is taken from the OLD
    // even-numbered value, so the statement order within each pair matters.
    V00 = _mm_mul_ps(V00, V10);
    V01 = _mm_mul_ps(V01, V11);
    V02 = _mm_mul_ps(V02, V12);
    V03 = _mm_mul_ps(V03, V13);
    XMVECTOR C1 = _mm_sub_ps(C0, V00);
    C0 = _mm_add_ps(C0, V00);
    XMVECTOR C3 = _mm_add_ps(C2, V01);
    C2 = _mm_sub_ps(C2, V01);
    XMVECTOR C5 = _mm_sub_ps(C4, V02);
    C4 = _mm_add_ps(C4, V02);
    XMVECTOR C7 = _mm_add_ps(C6, V03);
    C6 = _mm_sub_ps(C6, V03);

    // Interleave each even/odd pair: the shuffle gathers (even.x, even.z,
    // odd.y, odd.w) and the following permute reorders that to
    // (even.x, odd.y, even.z, odd.w).
    C0 = _mm_shuffle_ps(C0, C1, _MM_SHUFFLE(3, 1, 2, 0));
    C2 = _mm_shuffle_ps(C2, C3, _MM_SHUFFLE(3, 1, 2, 0));
    C4 = _mm_shuffle_ps(C4, C5, _MM_SHUFFLE(3, 1, 2, 0));
    C6 = _mm_shuffle_ps(C6, C7, _MM_SHUFFLE(3, 1, 2, 0));
    C0 = XM_PERMUTE_PS(C0, _MM_SHUFFLE(3, 1, 2, 0));
    C2 = XM_PERMUTE_PS(C2, _MM_SHUFFLE(3, 1, 2, 0));
    C4 = XM_PERMUTE_PS(C4, _MM_SHUFFLE(3, 1, 2, 0));
    C6 = XM_PERMUTE_PS(C6, _MM_SHUFFLE(3, 1, 2, 0));
    // Get the determinant
    XMVECTOR vTemp = XMVector4Dot(C0, MT.r[0]);
    if (pDeterminant != nullptr)
        *pDeterminant = vTemp;
    // 1 / determinant, with no guard against a zero determinant.
    vTemp = _mm_div_ps(g_XMOne, vTemp);
    XMMATRIX mResult;
    mResult.r[0] = _mm_mul_ps(C0, vTemp);
    mResult.r[1] = _mm_mul_ps(C2, vTemp);
    mResult.r[2] = _mm_mul_ps(C4, vTemp);
    mResult.r[3] = _mm_mul_ps(C6, vTemp);
    return mResult;
#endif
}

//------------------------------------------------------------------------------

// Builds the tensor (outer) product of V1 and V2: row i of the result is V2
// multiplied component-wise by element i of V1 replicated into all four lanes.
inline XMMATRIX XM_CALLCONV XMMatrixVectorTensorProduct
(
    FXMVECTOR V1,
    FXMVECTOR V2
) noexcept
{
    // Replicate each element of V1 across a whole vector.
    const auto v1Elem0 = XMVectorSwizzle<0, 0, 0, 0>(V1);
    const auto v1Elem1 = XMVectorSwizzle<1, 1, 1, 1>(V1);
    const auto v1Elem2 = XMVectorSwizzle<2, 2, 2, 2>(V1);
    const auto v1Elem3 = XMVectorSwizzle<3, 3, 3, 3>(V1);

    // Scale V2 by each replicated element to form the four rows.
    XMMATRIX outer;
    outer.r[0] = XMVectorMultiply(v1Elem0, V2);
    outer.r[1] = XMVectorMultiply(v1Elem1, V2);
    outer.r[2] = XMVectorMultiply(v1Elem2, V2);
    outer.r[3] = XMVectorMultiply(v1Elem3, V2);
    return outer;
}

//------------------------------------------------------------------------------

// Computes the determinant of the 4x4 matrix M.
//
// Rows 2 and 3 are combined into three vectors of "product minus product"
// terms, those are weighted by swizzles of row 1, and the result is dotted
// with row 0 after applying alternating (+,-,+,-) signs.
// Returns the XMVector4Dot of those two vectors.
inline XMVECTOR XM_CALLCONV XMMatrixDeterminant(FXMMATRIX M) noexcept
{
    static const XMVECTORF32 Sign = { { { 1.0f, -1.0f, 1.0f, -1.0f } } };

    // The three lane arrangements needed from each of rows 2 and 3.
    const XMVECTOR row2_yxxx = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[2]);
    const XMVECTOR row2_zzyy = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[2]);
    const XMVECTOR row2_wwwz = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[2]);
    const XMVECTOR row3_yxxx = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[3]);
    const XMVECTOR row3_zzyy = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[3]);
    const XMVECTOR row3_wwwz = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[3]);

    // Leading products of the row 2 / row 3 terms ...
    XMVECTOR pairYZ = XMVectorMultiply(row2_yxxx, row3_zzyy);
    XMVECTOR pairYW = XMVectorMultiply(row2_yxxx, row3_wwwz);
    XMVECTOR pairZW = XMVectorMultiply(row2_zzyy, row3_wwwz);

    // ... combined with the cross products via the negative-multiply-subtract helper.
    pairYZ = XMVectorNegativeMultiplySubtract(row2_zzyy, row3_yxxx, pairYZ);
    pairYW = XMVectorNegativeMultiplySubtract(row2_wwwz, row3_yxxx, pairYW);
    pairZW = XMVectorNegativeMultiplySubtract(row2_wwwz, row3_zzyy, pairZW);

    // Weight the three term vectors by the matching elements of row 1.
    const XMVECTOR row1_wwwz = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[1]);
    const XMVECTOR row1_zzyy = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[1]);
    const XMVECTOR row1_yxxx = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[1]);

    XMVECTOR minors = XMVectorMultiply(row1_wwwz, pairYZ);
    minors = XMVectorNegativeMultiplySubtract(row1_zzyy, pairYW, minors);
    minors = XMVectorMultiplyAdd(row1_yxxx, pairZW, minors);

    // Row 0 with alternating signs, dotted against the accumulated terms.
    const XMVECTOR signedRow0 = XMVectorMultiply(M.r[0], Sign.v);
    return XMVector4Dot(signedRow0, minors);
}

// Ranks three values. On exit (a) holds the index (0..2) of the largest of
// x, y, z, (c) the index of the smallest and (b) the remaining index. Ties
// are resolved by the strict '<' comparisons below.
//
// Wrapped in do { } while (0) so that the expansion is a single statement
// that must be terminated with ';' at the call site.
#define XM3RANKDECOMPOSE(a, b, c, x, y, z)      \
    do                                          \
    {                                           \
        if((x) < (y))                           \
        {                                       \
            if((y) < (z))                       \
            {                                   \
                (a) = 2;                        \
                (b) = 1;                        \
                (c) = 0;                        \
            }                                   \
            else                                \
            {                                   \
                (a) = 1;                        \
                                                \
                if((x) < (z))                   \
                {                               \
                    (b) = 2;                    \
                    (c) = 0;                    \
                }                               \
                else                            \
                {                               \
                    (b) = 0;                    \
                    (c) = 2;                    \
                }                               \
            }                                   \
        }                                       \
        else                                    \
        {                                       \
            if((x) < (z))                       \
            {                                   \
                (a) = 2;                        \
                (b) = 0;                        \
                (c) = 1;                        \
            }                                   \
            else                                \
            {                                   \
                (a) = 0;                        \
                                                \
                if((y) < (z))                   \
                {                               \
                    (b) = 2;                    \
                    (c) = 1;                    \
                }                               \
                else                            \
                {                               \
                    (b) = 1;                    \
                    (c) = 2;                    \
                }                               \
            }                                   \
        }                                       \
    } while (0)

// Threshold below which a basis-vector length is treated as degenerate, and
// above which the squared determinant error marks the matrix as non-SRT.
#define XM3_DECOMP_EPSILON 0.0001f


// Decomposes M into scale, rotation (quaternion) and translation.
//
// outScale:   receives the lengths of rows 0..2 of M in x, y, z (w = 0); one
//             component is negated when the basis determinant is negative.
// outRotQuat: receives XMQuaternionRotationMatrix of the normalized basis.
//             Written only when the function returns true.
// outTrans:   receives row 3 of M.
// M:          the matrix to decompose.
//
// All three output pointers must be non-null (asserted).
//
// Returns false when the determinant of the normalized basis differs from 1
// by more than the epsilon tolerance ("non-SRT matrix"). In that case outTrans
// and outScale have already been written but outRotQuat has not.
inline bool XM_CALLCONV XMMatrixDecompose
(
    XMVECTOR* outScale,
    XMVECTOR* outRotQuat,
    XMVECTOR* outTrans,
    FXMMATRIX M
) noexcept
{
    static const XMVECTOR* pvCanonicalBasis[3] = {
        &g_XMIdentityR0.v,
        &g_XMIdentityR1.v,
        &g_XMIdentityR2.v
    };

    assert(outScale != nullptr);
    assert(outRotQuat != nullptr);
    assert(outTrans != nullptr);

    // Get the translation
    outTrans[0] = M.r[3];

    // Work on a local copy of the upper three rows; ppvBasis[i] aliases row i
    // of that copy so the rows can be addressed by their ranked index.
    XMVECTOR* ppvBasis[3];
    XMMATRIX matTemp;
    ppvBasis[0] = &matTemp.r[0];
    ppvBasis[1] = &matTemp.r[1];
    ppvBasis[2] = &matTemp.r[2];

    matTemp.r[0] = M.r[0];
    matTemp.r[1] = M.r[1];
    matTemp.r[2] = M.r[2];
    matTemp.r[3] = g_XMIdentityR3.v;

    // The scale is written component-by-component through a float view of
    // the output vector.
    auto pfScales = reinterpret_cast<float*>(outScale);

    size_t a, b, c;
    XMVectorGetXPtr(&pfScales[0], XMVector3Length(ppvBasis[0][0]));
    XMVectorGetXPtr(&pfScales[1], XMVector3Length(ppvBasis[1][0]));
    XMVectorGetXPtr(&pfScales[2], XMVector3Length(ppvBasis[2][0]));
    pfScales[3] = 0.f;

    // a = axis with the largest scale, c = smallest, b = the one in between.
    XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2]);

    // Even the largest axis is degenerate: fall back to the canonical axis.
    if (pfScales[a] < XM3_DECOMP_EPSILON)
    {
        ppvBasis[a][0] = pvCanonicalBasis[a][0];
    }
    ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);

    if (pfScales[b] < XM3_DECOMP_EPSILON)
    {
        size_t aa, bb, cc;
        float fAbsX, fAbsY, fAbsZ;

        fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
        fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
        fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));

        // cc = component of basis 'a' with the smallest magnitude; crossing
        // with that canonical axis rebuilds the missing second basis vector.
        XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ);

        ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0], pvCanonicalBasis[cc][0]);
    }

    ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);

    // A degenerate third axis is rebuilt from the other two.
    if (pfScales[c] < XM3_DECOMP_EPSILON)
    {
        ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0], ppvBasis[b][0]);
    }

    ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);

    float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));

    // Use the sign of the determinant (Cramer's rule) to check the handedness
    // of the coordinate system.
    if (fDet < 0.0f)
    {
        // Switch coordinate system by negating the scale and inverting the
        // basis vector of axis 'a' (the axis with the largest scale).
        pfScales[a] = -pfScales[a];
        ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);

        fDet = -fDet;
    }

    // Squared distance of the determinant from 1.
    fDet -= 1.0f;
    fDet *= fDet;

    if (XM3_DECOMP_EPSILON < fDet)
    {
        // Non-SRT matrix encountered
        return false;
    }

    // generate the quaternion from the matrix
    outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
    return true;
}

#undef XM3_DECOMP_EPSILON
#undef XM3RANKDECOMPOSE

//------------------------------------------------------------------------------
// Transformation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns a matrix whose four rows are the g_XMIdentityR0..R3 constants,
// i.e. the 4x4 identity matrix.
inline XMMATRIX XM_CALLCONV XMMatrixIdentity() noexcept
{
    XMMATRIX identity;
    identity.r[0] = g_XMIdentityR0.v;   // row 0
    identity.r[1] = g_XMIdentityR1.v;   // row 1
    identity.r[2] = g_XMIdentityR2.v;   // row 2
    identity.r[3] = g_XMIdentityR3.v;   // row 3
    return identity;
}

//------------------------------------------------------------------------------

// Builds a matrix from sixteen scalars given in row-major order: mRC becomes
// the element at row R, column C.
inline XMMATRIX XM_CALLCONV XMMatrixSet
(
    float m00, float m01, float m02, float m03,
    float m10, float m11, float m12, float m13,
    float m20, float m21, float m22, float m23,
    float m30, float m31, float m32, float m33
) noexcept
{
    XMMATRIX result;
#if !defined(_XM_NO_INTRINSICS_)
    // Vector builds: assemble one row at a time.
    result.r[0] = XMVectorSet(m00, m01, m02, m03);
    result.r[1] = XMVectorSet(m10, m11, m12, m13);
    result.r[2] = XMVectorSet(m20, m21, m22, m23);
    result.r[3] = XMVectorSet(m30, m31, m32, m33);
#else
    // Scalar build: store every element individually.
    result.m[0][0] = m00;
    result.m[0][1] = m01;
    result.m[0][2] = m02;
    result.m[0][3] = m03;

    result.m[1][0] = m10;
    result.m[1][1] = m11;
    result.m[1][2] = m12;
    result.m[1][3] = m13;

    result.m[2][0] = m20;
    result.m[2][1] = m21;
    result.m[2][2] = m22;
    result.m[2][3] = m23;

    result.m[3][0] = m30;
    result.m[3][1] = m31;
    result.m[3][2] = m32;
    result.m[3][3] = m33;
#endif
    return result;
}

//------------------------------------------------------------------------------

// Builds a translation matrix: identity in rows 0..2, and
// (OffsetX, OffsetY, OffsetZ, 1) in row 3.
inline XMMATRIX XM_CALLCONV XMMatrixTranslation
(
    float OffsetX,
    float OffsetY,
    float OffsetZ
) noexcept
{
    XMMATRIX T;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    T.m[0][0] = 1.0f;    T.m[0][1] = 0.0f;    T.m[0][2] = 0.0f;    T.m[0][3] = 0.0f;
    T.m[1][0] = 0.0f;    T.m[1][1] = 1.0f;    T.m[1][2] = 0.0f;    T.m[1][3] = 0.0f;
    T.m[2][0] = 0.0f;    T.m[2][1] = 0.0f;    T.m[2][2] = 1.0f;    T.m[2][3] = 0.0f;
    T.m[3][0] = OffsetX; T.m[3][1] = OffsetY; T.m[3][2] = OffsetZ; T.m[3][3] = 1.0f;
#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    // Vector builds: reuse the identity row constants for rows 0..2.
    T.r[0] = g_XMIdentityR0.v;
    T.r[1] = g_XMIdentityR1.v;
    T.r[2] = g_XMIdentityR2.v;
    T.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f);
#endif

    return T;
}


//------------------------------------------------------------------------------

// Builds a translation matrix from the x, y, z components of Offset:
// identity in rows 0..2, and (Offset.x, Offset.y, Offset.z, 1) in row 3.
// The w component of Offset is not used.
inline XMMATRIX XM_CALLCONV XMMatrixTranslationFromVector(FXMVECTOR Offset) noexcept
{
    XMMATRIX T;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    T.m[0][0] = 1.0f; T.m[0][1] = 0.0f; T.m[0][2] = 0.0f; T.m[0][3] = 0.0f;
    T.m[1][0] = 0.0f; T.m[1][1] = 1.0f; T.m[1][2] = 0.0f; T.m[1][3] = 0.0f;
    T.m[2][0] = 0.0f; T.m[2][1] = 0.0f; T.m[2][2] = 1.0f; T.m[2][3] = 0.0f;

    T.m[3][0] = Offset.vector4_f32[0];
    T.m[3][1] = Offset.vector4_f32[1];
    T.m[3][2] = Offset.vector4_f32[2];
    T.m[3][3] = 1.0f;
#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    // Vector builds: rows 0..2 are the identity constants; row 3 merges the
    // identity row with Offset through the 1110 select mask.
    T.r[0] = g_XMIdentityR0.v;
    T.r[1] = g_XMIdentityR1.v;
    T.r[2] = g_XMIdentityR2.v;
    T.r[3] = XMVectorSelect(g_XMIdentityR3.v, Offset, g_XMSelect1110.v);
#endif

    return T;
}

//------------------------------------------------------------------------------

// Builds a scaling matrix: ScaleX, ScaleY, ScaleZ on the diagonal of rows
// 0..2, zeros elsewhere in those rows, and (0, 0, 0, 1) as row 3.
inline XMMATRIX XM_CALLCONV XMMatrixScaling
(
    float ScaleX,
    float ScaleY,
    float ScaleZ
) noexcept
{
    XMMATRIX S;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    S.m[0][0] = ScaleX; S.m[0][1] = 0.0f;   S.m[0][2] = 0.0f;   S.m[0][3] = 0.0f;
    S.m[1][0] = 0.0f;   S.m[1][1] = ScaleY; S.m[1][2] = 0.0f;   S.m[1][3] = 0.0f;
    S.m[2][0] = 0.0f;   S.m[2][1] = 0.0f;   S.m[2][2] = ScaleZ; S.m[2][3] = 0.0f;
    S.m[3][0] = 0.0f;   S.m[3][1] = 0.0f;   S.m[3][2] = 0.0f;   S.m[3][3] = 1.0f;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Start from an all-zero vector and drop each scale into its own lane.
    const XMVECTOR Zero = vdupq_n_f32(0);
    S.r[0] = vsetq_lane_f32(ScaleX, Zero, 0);
    S.r[1] = vsetq_lane_f32(ScaleY, Zero, 1);
    S.r[2] = vsetq_lane_f32(ScaleZ, Zero, 2);
    S.r[3] = g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps lists lanes from highest (w) to lowest (x).
    S.r[0] = _mm_set_ps(0, 0, 0, ScaleX);
    S.r[1] = _mm_set_ps(0, 0, ScaleY, 0);
    S.r[2] = _mm_set_ps(0, ScaleZ, 0, 0);
    S.r[3] = g_XMIdentityR3.v;
#endif

    return S;
}

//------------------------------------------------------------------------------

// Builds a scaling matrix from the x, y, z components of Scale: they go on
// the diagonal of rows 0..2, and row 3 is (0, 0, 0, 1). The w component of
// Scale is not used.
inline XMMATRIX XM_CALLCONV XMMatrixScalingFromVector(FXMVECTOR Scale) noexcept
{
    XMMATRIX S;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per group.
    S.m[0][0] = Scale.vector4_f32[0];
    S.m[0][1] = 0.0f; S.m[0][2] = 0.0f; S.m[0][3] = 0.0f;

    S.m[1][0] = 0.0f;
    S.m[1][1] = Scale.vector4_f32[1];
    S.m[1][2] = 0.0f; S.m[1][3] = 0.0f;

    S.m[2][0] = 0.0f; S.m[2][1] = 0.0f;
    S.m[2][2] = Scale.vector4_f32[2];
    S.m[2][3] = 0.0f;

    S.m[3][0] = 0.0f; S.m[3][1] = 0.0f; S.m[3][2] = 0.0f; S.m[3][3] = 1.0f;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Bitwise-AND the raw bits of Scale with a per-lane mask so that only
    // one component survives in each row.
    const auto scaleBits = vreinterpretq_u32_f32(Scale);
    S.r[0] = vreinterpretq_f32_u32(vandq_u32(scaleBits, g_XMMaskX));
    S.r[1] = vreinterpretq_f32_u32(vandq_u32(scaleBits, g_XMMaskY));
    S.r[2] = vreinterpretq_f32_u32(vandq_u32(scaleBits, g_XMMaskZ));
    S.r[3] = g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Bitwise-AND Scale with a per-lane mask so that only one component
    // survives in each row.
    S.r[0] = _mm_and_ps(Scale, g_XMMaskX);
    S.r[1] = _mm_and_ps(Scale, g_XMMaskY);
    S.r[2] = _mm_and_ps(Scale, g_XMMaskZ);
    S.r[3] = g_XMIdentityR3.v;
#endif

    return S;
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the x-axis. With s and c being the sine and
// cosine that XMScalarSinCos produces for Angle, the rows are:
//   ( 1,  0,  0,  0 )
//   ( 0,  c,  s,  0 )
//   ( 0, -s,  c,  0 )
//   ( 0,  0,  0,  1 )
inline XMMATRIX XM_CALLCONV XMMatrixRotationX(float Angle) noexcept
{
    float sinA;
    float cosA;
    XMScalarSinCos(&sinA, &cosA, Angle);

    XMMATRIX R;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    R.m[0][0] = 1.0f; R.m[0][1] = 0.0f;  R.m[0][2] = 0.0f; R.m[0][3] = 0.0f;
    R.m[1][0] = 0.0f; R.m[1][1] = cosA;  R.m[1][2] = sinA; R.m[1][3] = 0.0f;
    R.m[2][0] = 0.0f; R.m[2][1] = -sinA; R.m[2][2] = cosA; R.m[2][3] = 0.0f;
    R.m[3][0] = 0.0f; R.m[3][1] = 0.0f;  R.m[3][2] = 0.0f; R.m[3][3] = 1.0f;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t Zero = vdupq_n_f32(0);

    // (0, cos, sin, 0)
    float32x4_t row1 = vsetq_lane_f32(cosA, Zero, 1);
    row1 = vsetq_lane_f32(sinA, row1, 2);

    // (0, -sin, cos, 0)
    float32x4_t row2 = vsetq_lane_f32(-sinA, Zero, 1);
    row2 = vsetq_lane_f32(cosA, row2, 2);

    R.r[0] = g_XMIdentityR0.v;
    R.r[1] = row1;
    R.r[2] = row2;
    R.r[3] = g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // sin and cos each sit in lane x of an otherwise zero vector.
    const XMVECTOR sinInX = _mm_set_ss(sinA);
    const XMVECTOR cosInX = _mm_set_ss(cosA);

    // (0, cos, sin, 0)
    const XMVECTOR row1 = _mm_shuffle_ps(cosInX, sinInX, _MM_SHUFFLE(3, 0, 0, 3));

    // Swap y and z to get (0, sin, cos, 0), then negate y: (0, -sin, cos, 0)
    XMVECTOR row2 = XM_PERMUTE_PS(row1, _MM_SHUFFLE(3, 1, 2, 0));
    row2 = _mm_mul_ps(row2, g_XMNegateY);

    R.r[0] = g_XMIdentityR0;
    R.r[1] = row1;
    R.r[2] = row2;
    R.r[3] = g_XMIdentityR3;
#endif

    return R;
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the y-axis. With s and c being the sine and
// cosine that XMScalarSinCos produces for Angle, the rows are:
//   ( c,  0, -s,  0 )
//   ( 0,  1,  0,  0 )
//   ( s,  0,  c,  0 )
//   ( 0,  0,  0,  1 )
inline XMMATRIX XM_CALLCONV XMMatrixRotationY(float Angle) noexcept
{
    float sinA;
    float cosA;
    XMScalarSinCos(&sinA, &cosA, Angle);

    XMMATRIX R;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    R.m[0][0] = cosA; R.m[0][1] = 0.0f; R.m[0][2] = -sinA; R.m[0][3] = 0.0f;
    R.m[1][0] = 0.0f; R.m[1][1] = 1.0f; R.m[1][2] = 0.0f;  R.m[1][3] = 0.0f;
    R.m[2][0] = sinA; R.m[2][1] = 0.0f; R.m[2][2] = cosA;  R.m[2][3] = 0.0f;
    R.m[3][0] = 0.0f; R.m[3][1] = 0.0f; R.m[3][2] = 0.0f;  R.m[3][3] = 1.0f;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t Zero = vdupq_n_f32(0);

    // (cos, 0, -sin, 0)
    float32x4_t row0 = vsetq_lane_f32(cosA, Zero, 0);
    row0 = vsetq_lane_f32(-sinA, row0, 2);

    // (sin, 0, cos, 0)
    float32x4_t row2 = vsetq_lane_f32(sinA, Zero, 0);
    row2 = vsetq_lane_f32(cosA, row2, 2);

    R.r[0] = row0;
    R.r[1] = g_XMIdentityR1.v;
    R.r[2] = row2;
    R.r[3] = g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // sin and cos each sit in lane x of an otherwise zero vector.
    const XMVECTOR sinInX = _mm_set_ss(sinA);
    const XMVECTOR cosInX = _mm_set_ss(cosA);

    // (sin, 0, cos, 0)
    const XMVECTOR row2 = _mm_shuffle_ps(sinInX, cosInX, _MM_SHUFFLE(3, 0, 3, 0));

    // Swap x and z to get (cos, 0, sin, 0), then negate z: (cos, 0, -sin, 0)
    XMVECTOR row0 = XM_PERMUTE_PS(row2, _MM_SHUFFLE(3, 0, 1, 2));
    row0 = _mm_mul_ps(row0, g_XMNegateZ);

    R.r[0] = row0;
    R.r[1] = g_XMIdentityR1;
    R.r[2] = row2;
    R.r[3] = g_XMIdentityR3;
#endif

    return R;
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the z-axis. With s and c being the sine and
// cosine that XMScalarSinCos produces for Angle, the rows are:
//   (  c,  s,  0,  0 )
//   ( -s,  c,  0,  0 )
//   (  0,  0,  1,  0 )
//   (  0,  0,  0,  1 )
inline XMMATRIX XM_CALLCONV XMMatrixRotationZ(float Angle) noexcept
{
    float sinA;
    float cosA;
    XMScalarSinCos(&sinA, &cosA, Angle);

    XMMATRIX R;

#if defined(_XM_NO_INTRINSICS_)
    // Scalar build: one matrix row per source line.
    R.m[0][0] = cosA;  R.m[0][1] = sinA; R.m[0][2] = 0.0f; R.m[0][3] = 0.0f;
    R.m[1][0] = -sinA; R.m[1][1] = cosA; R.m[1][2] = 0.0f; R.m[1][3] = 0.0f;
    R.m[2][0] = 0.0f;  R.m[2][1] = 0.0f; R.m[2][2] = 1.0f; R.m[2][3] = 0.0f;
    R.m[3][0] = 0.0f;  R.m[3][1] = 0.0f; R.m[3][2] = 0.0f; R.m[3][3] = 1.0f;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float32x4_t Zero = vdupq_n_f32(0);

    // (cos, sin, 0, 0)
    float32x4_t row0 = vsetq_lane_f32(cosA, Zero, 0);
    row0 = vsetq_lane_f32(sinA, row0, 1);

    // (-sin, cos, 0, 0)
    float32x4_t row1 = vsetq_lane_f32(-sinA, Zero, 0);
    row1 = vsetq_lane_f32(cosA, row1, 1);

    R.r[0] = row0;
    R.r[1] = row1;
    R.r[2] = g_XMIdentityR2.v;
    R.r[3] = g_XMIdentityR3.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // sin and cos each sit in lane x of an otherwise zero vector.
    const XMVECTOR sinInX = _mm_set_ss(sinA);
    const XMVECTOR cosInX = _mm_set_ss(cosA);

    // Interleave the low lanes: (cos, sin, 0, 0)
    const XMVECTOR row0 = _mm_unpacklo_ps(cosInX, sinInX);

    // Swap x and y to get (sin, cos, 0, 0), then negate x: (-sin, cos, 0, 0)
    XMVECTOR row1 = XM_PERMUTE_PS(row0, _MM_SHUFFLE(3, 2, 0, 1));
    row1 = _mm_mul_ps(row1, g_XMNegateX);

    R.r[0] = row0;
    R.r[1] = row1;
    R.r[2] = g_XMIdentityR2;
    R.r[3] = g_XMIdentityR3;
#endif

    return R;
}

//------------------------------------------------------------------------------

// Builds a rotation matrix from three Euler angles. Note the parameter order
// is (Pitch, Yaw, Roll) even though the function name lists roll first.
//
// Vector builds pack the angles as <Pitch, Yaw, Roll, 0> and defer to
// XMMatrixRotationRollPitchYawFromVector; the scalar build evaluates the
// sines and cosines with sinf/cosf and writes every element directly.
inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYaw
(
    float Pitch,
    float Yaw,
    float Roll
) noexcept
{
#if !defined(_XM_NO_INTRINSICS_)
    const XMVECTOR packedAngles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
    return XMMatrixRotationRollPitchYawFromVector(packedAngles);
#else
    const float cosPitch = cosf(Pitch);
    const float sinPitch = sinf(Pitch);

    const float cosYaw = cosf(Yaw);
    const float sinYaw = sinf(Yaw);

    const float cosRoll = cosf(Roll);
    const float sinRoll = sinf(Roll);

    XMMATRIX R;

    // Row 0
    R.m[0][0] = cosRoll * cosYaw + sinRoll * sinPitch * sinYaw;
    R.m[0][1] = sinRoll * cosPitch;
    R.m[0][2] = sinRoll * sinPitch * cosYaw - cosRoll * sinYaw;
    R.m[0][3] = 0.0f;

    // Row 1
    R.m[1][0] = cosRoll * sinPitch * sinYaw - sinRoll * cosYaw;
    R.m[1][1] = cosRoll * cosPitch;
    R.m[1][2] = sinRoll * sinYaw + cosRoll * sinPitch * cosYaw;
    R.m[1][3] = 0.0f;

    // Row 2
    R.m[2][0] = cosPitch * sinYaw;
    R.m[2][1] = -sinPitch;
    R.m[2][2] = cosPitch * cosYaw;
    R.m[2][3] = 0.0f;

    // Row 3: no translation
    R.m[3][0] = 0.0f;
    R.m[3][1] = 0.0f;
    R.m[3][2] = 0.0f;
    R.m[3][3] = 1.0f;
    return R;
#endif
}

//------------------------------------------------------------------------------

// Builds a rotation matrix from Euler angles packed as <Pitch, Yaw, Roll, _>;
// the fourth component of Angles does not reach the result.
//
// Writing sp/cp, sy/cy, sr/cr for the sine/cosine of pitch, yaw and roll, the
// rows produced are:
//   ( cr*cy + sr*sp*sy,  sr*cp,  sr*sp*cy - cr*sy,  0 )
//   ( cr*sp*sy - sr*cy,  cr*cp,  sr*sy + cr*sp*cy,  0 )
//   ( cp*sy,            -sp,     cp*cy,             0 )
//   ( 0,                 0,      0,                 1 )
inline XMMATRIX XM_CALLCONV XMMatrixRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float cosPitch = cosf(Angles.vector4_f32[0]);
    const float sinPitch = sinf(Angles.vector4_f32[0]);

    const float cosYaw = cosf(Angles.vector4_f32[1]);
    const float sinYaw = sinf(Angles.vector4_f32[1]);

    const float cosRoll = cosf(Angles.vector4_f32[2]);
    const float sinRoll = sinf(Angles.vector4_f32[2]);

    XMMATRIX R;

    // Row 0
    R.m[0][0] = cosRoll * cosYaw + sinRoll * sinPitch * sinYaw;
    R.m[0][1] = sinRoll * cosPitch;
    R.m[0][2] = sinRoll * sinPitch * cosYaw - cosRoll * sinYaw;
    R.m[0][3] = 0.0f;

    // Row 1
    R.m[1][0] = cosRoll * sinPitch * sinYaw - sinRoll * cosYaw;
    R.m[1][1] = cosRoll * cosPitch;
    R.m[1][2] = sinRoll * sinYaw + cosRoll * sinPitch * cosYaw;
    R.m[1][3] = 0.0f;

    // Row 2
    R.m[2][0] = cosPitch * sinYaw;
    R.m[2][1] = -sinPitch;
    R.m[2][2] = cosPitch * cosYaw;
    R.m[2][3] = 0.0f;

    // Row 3: no translation
    R.m[3][0] = 0.0f;
    R.m[3][1] = 0.0f;
    R.m[3][2] = 0.0f;
    R.m[3][3] = 1.0f;
    return R;
#else
    static const XMVECTORF32  Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };

    // Sin = (sp, sy, sr, _), Cos = (cp, cy, cr, _)
    XMVECTOR Sin, Cos;
    XMVectorSinCos(&Sin, &Cos, Angles);

    // Lane arrangements of the sines/cosines used by the products below.
    const XMVECTOR cp_sr_cr_cp = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0Z, XM_PERMUTE_1Z, XM_PERMUTE_1X>(Sin, Cos);
    const XMVECTOR sy_cp_cp_cy = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1Y>(Sin, Cos);
    const XMVECTOR cr_sr_cr_sr = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z>(Sin, Cos);
    const XMVECTOR cy_cy_sy_sy = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Y>(Sin, Cos);
    const XMVECTOR sr_cr_sr_cr = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(Sin, Cos);
    const XMVECTOR sy_sy_cy_cy = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(Sin, Cos);
    const XMVECTOR sp_splat = XMVectorSplatX(Sin);
    const XMVECTOR negSin = XMVectorNegate(Sin);

    // Two-factor terms: (cp*sy, sr*cp, cr*cp, cp*cy)
    const XMVECTOR twoFactor = XMVectorMultiply(cp_sr_cr_cp, sy_cp_cp_cy);

    // Signed roll*yaw terms: (cr*cy, -sr*cy, -cr*sy, sr*sy)
    XMVECTOR rollYaw = XMVectorMultiply(cr_sr_cr_sr, Sign.v);
    rollYaw = XMVectorMultiply(rollYaw, cy_cy_sy_sy);

    // Three-factor terms (roll * sp * yaw) added onto the signed terms.
    XMVECTOR threeFactor = XMVectorMultiply(sr_cr_sr_cr, sp_splat);
    threeFactor = XMVectorMultiplyAdd(threeFactor, sy_sy_cy_cy, rollYaw);

    // Gather the x, y, z lanes of each output row; the w lanes are replaced
    // with zero by the selects below.
    const XMVECTOR row0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0W>(twoFactor, threeFactor);
    const XMVECTOR row1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_0W>(twoFactor, threeFactor);
    const XMVECTOR row2 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_0W, XM_PERMUTE_0W>(twoFactor, negSin);

    XMMATRIX R;
    R.r[0] = XMVectorSelect(g_XMZero, row0, g_XMSelect1110.v);
    R.r[1] = XMVectorSelect(g_XMZero, row1, g_XMSelect1110.v);
    R.r[2] = XMVectorSelect(g_XMZero, row2, g_XMSelect1110.v);
    R.r[3] = g_XMIdentityR3;
    return R;
#endif
}

//------------------------------------------------------------------------------

// Builds a matrix that rotates by Angle about NormalAxis.
//
// NormalAxis: rotation axis; it is used as given, with no normalization in
//             this function (XMMatrixRotationAxis normalizes before calling).
// Angle:      passed to XMScalarSinCos.
// Returns a matrix whose last row is g_XMIdentityR3 and whose first three rows
// are assembled from (1 - cos) * n_i * n_j, cos and +/- sin * n_k terms.
inline XMMATRIX XM_CALLCONV XMMatrixRotationNormal
(
    FXMVECTOR NormalAxis,
    float     Angle
) noexcept
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    float SinTheta;
    float CosTheta;
    XMScalarSinCos(&SinTheta, &CosTheta, Angle);

    // <sin, cos, 1 - cos, 0>; lane W also supplies the zero for the diagonal's W.
    const XMVECTOR Trig = XMVectorSet(SinTheta, CosTheta, 1.0f - CosTheta, 0.0f);

    const XMVECTOR OneMinusCos = XMVectorSplatZ(Trig);
    const XMVECTOR Cos = XMVectorSplatY(Trig);
    const XMVECTOR Sin = XMVectorSplatX(Trig);

    const XMVECTOR AxisYZX = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(NormalAxis);
    const XMVECTOR AxisZXY = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(NormalAxis);

    // (1 - cos) * <y*z, z*x, x*y, w*w>
    const XMVECTOR Cross = XMVectorMultiply(XMVectorMultiply(OneMinusCos, AxisYZX), AxisZXY);

    // (1 - cos) * n * n + cos
    XMVECTOR Diag = XMVectorMultiply(OneMinusCos, NormalAxis);
    Diag = XMVectorMultiplyAdd(Diag, NormalAxis, Cos);

    // Cross + sin * n   and   Cross - sin * n
    const XMVECTOR Plus = XMVectorMultiplyAdd(Sin, NormalAxis, Cross);
    const XMVECTOR Minus = XMVectorNegativeMultiplySubtract(Sin, NormalAxis, Cross);

    // Take W from Trig so the diagonal vector carries the 0 stored there.
    Diag = XMVectorSelect(Trig, Diag, g_XMSelect1110.v);
    const XMVECTOR MixA = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(Plus, Minus);
    const XMVECTOR MixB = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(Plus, Minus);

    XMMATRIX Result;
    Result.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(Diag, MixA);
    Result.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(Diag, MixA);
    Result.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(Diag, MixB);
    Result.r[3] = g_XMIdentityR3.v;
    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    float SinTheta;
    float CosTheta;
    XMScalarSinCos(&SinTheta, &CosTheta, Angle);

    const XMVECTOR OneMinusCos = _mm_set_ps1(1.0f - CosTheta);
    const XMVECTOR Cos = _mm_set_ps1(CosTheta);
    const XMVECTOR Sin = _mm_set_ps1(SinTheta);

    // Axis rotated to <y, z, x, w> and <z, x, y, w>.
    const XMVECTOR AxisYZX = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 0, 2, 1));
    const XMVECTOR AxisZXY = XM_PERMUTE_PS(NormalAxis, _MM_SHUFFLE(3, 1, 0, 2));

    // (1 - cos) * <y*z, z*x, x*y, w*w>
    const XMVECTOR Cross = _mm_mul_ps(_mm_mul_ps(OneMinusCos, AxisYZX), AxisZXY);

    // (1 - cos) * n * n + cos, then masked with g_XMMask3.
    XMVECTOR Diag = _mm_mul_ps(_mm_mul_ps(OneMinusCos, NormalAxis), NormalAxis);
    Diag = _mm_add_ps(Diag, Cos);
    Diag = _mm_and_ps(Diag, g_XMMask3);

    // Cross + sin * n   and   Cross - sin * n
    const XMVECTOR Plus = _mm_add_ps(_mm_mul_ps(Sin, NormalAxis), Cross);
    const XMVECTOR Minus = _mm_sub_ps(Cross, _mm_mul_ps(Sin, NormalAxis));

    XMVECTOR MixA = _mm_shuffle_ps(Plus, Minus, _MM_SHUFFLE(2, 1, 2, 0));
    MixA = XM_PERMUTE_PS(MixA, _MM_SHUFFLE(0, 3, 2, 1));
    XMVECTOR MixB = _mm_shuffle_ps(Plus, Minus, _MM_SHUFFLE(0, 0, 1, 1));
    MixB = XM_PERMUTE_PS(MixB, _MM_SHUFFLE(2, 0, 2, 0));

    XMMATRIX Result;

    XMVECTOR Row = _mm_shuffle_ps(Diag, MixA, _MM_SHUFFLE(1, 0, 3, 0));
    Row = XM_PERMUTE_PS(Row, _MM_SHUFFLE(1, 3, 2, 0));
    Result.r[0] = Row;

    Row = _mm_shuffle_ps(Diag, MixA, _MM_SHUFFLE(3, 2, 3, 1));
    Row = XM_PERMUTE_PS(Row, _MM_SHUFFLE(1, 3, 0, 2));
    Result.r[1] = Row;

    Result.r[2] = _mm_shuffle_ps(MixB, Diag, _MM_SHUFFLE(3, 2, 1, 0));
    Result.r[3] = g_XMIdentityR3.v;
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a matrix that rotates by Angle about an arbitrary (not necessarily
// unit-length) Axis.
//
// The axis is normalized with XMVector3Normalize and forwarded to
// XMMatrixRotationNormal. Builds with assertions enabled reject an axis whose
// XYZ part is zero or infinite.
inline XMMATRIX XM_CALLCONV XMMatrixRotationAxis
(
    FXMVECTOR Axis,
    float     Angle
) noexcept
{
    assert(!XMVector3Equal(Axis, XMVectorZero()));
    assert(!XMVector3IsInfinite(Axis));

    return XMMatrixRotationNormal(XMVector3Normalize(Axis), Angle);
}

//------------------------------------------------------------------------------

// Converts a quaternion stored as <x, y, z, w> into a rotation matrix.
//
// The quaternion is used as given; nothing in this function normalizes it.
// The result has a zero fourth column in rows 0-2 and the identity last row.
inline XMMATRIX XM_CALLCONV XMMatrixRotationQuaternion(FXMVECTOR Quaternion) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    const float x = Quaternion.vector4_f32[0];
    const float y = Quaternion.vector4_f32[1];
    const float z = Quaternion.vector4_f32[2];
    const float w = Quaternion.vector4_f32[3];

    const float xx = x * x;
    const float yy = y * y;
    const float zz = z * z;

    XMMATRIX Result;

    // Row 0
    Result.m[0][0] = 1.f - 2.f * yy - 2.f * zz;
    Result.m[0][1] = 2.f * x * y + 2.f * z * w;
    Result.m[0][2] = 2.f * x * z - 2.f * y * w;
    Result.m[0][3] = 0.f;

    // Row 1
    Result.m[1][0] = 2.f * x * y - 2.f * z * w;
    Result.m[1][1] = 1.f - 2.f * xx - 2.f * zz;
    Result.m[1][2] = 2.f * y * z + 2.f * x * w;
    Result.m[1][3] = 0.f;

    // Row 2
    Result.m[2][0] = 2.f * x * z + 2.f * y * w;
    Result.m[2][1] = 2.f * y * z - 2.f * x * w;
    Result.m[2][2] = 1.f - 2.f * xx - 2.f * yy;
    Result.m[2][3] = 0.f;

    // Row 3: no translation
    Result.m[3][0] = 0.f;
    Result.m[3][1] = 0.f;
    Result.m[3][2] = 0.f;
    Result.m[3][3] = 1.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } };

    // 2q and 2q*q = <2xx, 2yy, 2zz, 2ww>
    const XMVECTOR Twice = XMVectorAdd(Quaternion, Quaternion);
    const XMVECTOR TwiceSq = XMVectorMultiply(Quaternion, Twice);

    // Diagonal: <1,1,1,0> - <2yy, 2xx, 2xx, 0> - <2zz, 2zz, 2yy, 0>
    const XMVECTOR SqA = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W>(TwiceSq, Constant1110.v);
    const XMVECTOR SqB = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W>(TwiceSq, Constant1110.v);
    XMVECTOR Diag = XMVectorSubtract(Constant1110, SqA);
    Diag = XMVectorSubtract(Diag, SqB);

    // <x, x, y, w> * 2<z, y, z, w>
    const XMVECTOR LhsA = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Quaternion);
    const XMVECTOR RhsA = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Twice);
    const XMVECTOR ProdA = XMVectorMultiply(LhsA, RhsA);

    // <w, w, w, w> * 2<y, z, x, w>
    const XMVECTOR LhsB = XMVectorSplatW(Quaternion);
    const XMVECTOR RhsB = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(Twice);
    const XMVECTOR ProdB = XMVectorMultiply(LhsB, RhsB);

    const XMVECTOR Sum = XMVectorAdd(ProdA, ProdB);
    const XMVECTOR Diff = XMVectorSubtract(ProdA, ProdB);

    const XMVECTOR MixA = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z>(Sum, Diff);
    const XMVECTOR MixB = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z>(Sum, Diff);

    XMMATRIX Result;
    Result.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(Diag, MixA);
    Result.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(Diag, MixA);
    Result.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(Diag, MixB);
    Result.r[3] = g_XMIdentityR3.v;
    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Constant1110 = { { { 1.0f, 1.0f, 1.0f, 0.0f } } };

    // 2q and 2q*q = <2xx, 2yy, 2zz, 2ww>
    const XMVECTOR Twice = _mm_add_ps(Quaternion, Quaternion);
    const XMVECTOR TwiceSq = _mm_mul_ps(Quaternion, Twice);

    // Diagonal: <1,1,1,0> - <2yy, 2xx, 2xx, _> - <2zz, 2zz, 2yy, _>,
    // with both subtrahends masked by g_XMMask3 first.
    XMVECTOR SqA = XM_PERMUTE_PS(TwiceSq, _MM_SHUFFLE(3, 0, 0, 1));
    SqA = _mm_and_ps(SqA, g_XMMask3);
    XMVECTOR SqB = XM_PERMUTE_PS(TwiceSq, _MM_SHUFFLE(3, 1, 2, 2));
    SqB = _mm_and_ps(SqB, g_XMMask3);
    XMVECTOR Diag = _mm_sub_ps(Constant1110, SqA);
    Diag = _mm_sub_ps(Diag, SqB);

    // <x, x, y, w> * 2<z, y, z, w>
    const XMVECTOR LhsA = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 1, 0, 0));
    const XMVECTOR RhsA = XM_PERMUTE_PS(Twice, _MM_SHUFFLE(3, 2, 1, 2));
    const XMVECTOR ProdA = _mm_mul_ps(LhsA, RhsA);

    // <w, w, w, w> * 2<y, z, x, w>
    const XMVECTOR LhsB = XM_PERMUTE_PS(Quaternion, _MM_SHUFFLE(3, 3, 3, 3));
    const XMVECTOR RhsB = XM_PERMUTE_PS(Twice, _MM_SHUFFLE(3, 0, 2, 1));
    const XMVECTOR ProdB = _mm_mul_ps(LhsB, RhsB);

    const XMVECTOR Sum = _mm_add_ps(ProdA, ProdB);
    const XMVECTOR Diff = _mm_sub_ps(ProdA, ProdB);

    XMVECTOR MixA = _mm_shuffle_ps(Sum, Diff, _MM_SHUFFLE(1, 0, 2, 1));
    MixA = XM_PERMUTE_PS(MixA, _MM_SHUFFLE(1, 3, 2, 0));
    XMVECTOR MixB = _mm_shuffle_ps(Sum, Diff, _MM_SHUFFLE(2, 2, 0, 0));
    MixB = XM_PERMUTE_PS(MixB, _MM_SHUFFLE(2, 0, 2, 0));

    XMMATRIX Result;

    XMVECTOR Row = _mm_shuffle_ps(Diag, MixA, _MM_SHUFFLE(1, 0, 3, 0));
    Row = XM_PERMUTE_PS(Row, _MM_SHUFFLE(1, 3, 2, 0));
    Result.r[0] = Row;

    Row = _mm_shuffle_ps(Diag, MixA, _MM_SHUFFLE(3, 2, 3, 1));
    Row = XM_PERMUTE_PS(Row, _MM_SHUFFLE(1, 3, 0, 2));
    Result.r[1] = Row;

    Result.r[2] = _mm_shuffle_ps(MixB, Diag, _MM_SHUFFLE(3, 2, 1, 0));
    Result.r[3] = g_XMIdentityR3;
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a 2D transformation matrix from a scaling centre, scaling
// orientation, scale, rotation centre, rotation and translation.
//
// ScalingOrientation and Rotation are handed to XMMatrixRotationZ. Each vector
// input is first passed through XMVectorSelect with g_XMSelect1100, so only
// the lanes that mask selects are taken from the caller (the remaining lanes
// come from the mask itself, or from g_XMOne for Scaling).
inline XMMATRIX XM_CALLCONV XMMatrixTransformation2D
(
    FXMVECTOR ScalingOrigin,
    float     ScalingOrientation,
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    float     Rotation,
    GXMVECTOR Translation
) noexcept
{
    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    // Masked copies of the vector inputs.
    const XMVECTOR ScaleCentre = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v);
    const XMVECTOR ScaleFactors = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
    const XMVECTOR RotateCentre = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
    const XMVECTOR Offset = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v);

    // Component matrices.
    const XMMATRIX ToScaleCentre = XMMatrixTranslationFromVector(XMVectorNegate(ScaleCentre));
    const XMMATRIX Orientation = XMMatrixRotationZ(ScalingOrientation);
    const XMMATRIX OrientationT = XMMatrixTranspose(Orientation);
    const XMMATRIX Scale = XMMatrixScalingFromVector(ScaleFactors);
    const XMMATRIX Rotate = XMMatrixRotationZ(Rotation);

    // Translations are folded directly into row 3 instead of multiplying by
    // separate translation matrices.
    XMMATRIX Result = XMMatrixMultiply(ToScaleCentre, OrientationT);
    Result = XMMatrixMultiply(Result, Scale);
    Result = XMMatrixMultiply(Result, Orientation);
    Result.r[3] = XMVectorSubtract(XMVectorAdd(Result.r[3], ScaleCentre), RotateCentre);
    Result = XMMatrixMultiply(Result, Rotate);
    Result.r[3] = XMVectorAdd(XMVectorAdd(Result.r[3], RotateCentre), Offset);

    return Result;
}

//------------------------------------------------------------------------------

// Builds a 3D transformation matrix from a scaling centre, scaling
// orientation quaternion, scale, rotation centre, rotation quaternion and
// translation.
//
// The origin and translation vectors are passed through XMVectorSelect with
// g_XMSelect1110 before they are added to or subtracted from row 3 of the
// result. Scaling is handed to XMMatrixScalingFromVector unmodified.
inline XMMATRIX XM_CALLCONV XMMatrixTransformation
(
    FXMVECTOR ScalingOrigin,
    FXMVECTOR ScalingOrientationQuaternion,
    FXMVECTOR Scaling,
    GXMVECTOR RotationOrigin,
    HXMVECTOR RotationQuaternion,
    HXMVECTOR Translation
) noexcept
{
    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
    //         MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    // Masked copies of the vector inputs.
    const XMVECTOR ScaleCentre = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v);
    const XMVECTOR RotateCentre = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
    const XMVECTOR Offset = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);

    // Component matrices. The inverse scaling-origin translation is built from
    // the caller's ScalingOrigin directly, not from the masked copy.
    // NOTE(review): presumably XMMatrixTranslationFromVector ignores lane W,
    // making the two equivalent - confirm against its definition.
    const XMMATRIX ToScaleCentre = XMMatrixTranslationFromVector(XMVectorNegate(ScalingOrigin));
    const XMMATRIX Orientation = XMMatrixRotationQuaternion(ScalingOrientationQuaternion);
    const XMMATRIX OrientationT = XMMatrixTranspose(Orientation);
    const XMMATRIX Scale = XMMatrixScalingFromVector(Scaling);
    const XMMATRIX Rotate = XMMatrixRotationQuaternion(RotationQuaternion);

    // Translations are folded directly into row 3 instead of multiplying by
    // separate translation matrices.
    XMMATRIX Result = XMMatrixMultiply(ToScaleCentre, OrientationT);
    Result = XMMatrixMultiply(Result, Scale);
    Result = XMMatrixMultiply(Result, Orientation);
    Result.r[3] = XMVectorSubtract(XMVectorAdd(Result.r[3], ScaleCentre), RotateCentre);
    Result = XMMatrixMultiply(Result, Rotate);
    Result.r[3] = XMVectorAdd(XMVectorAdd(Result.r[3], RotateCentre), Offset);
    return Result;
}

//------------------------------------------------------------------------------

// Builds a 2D affine transformation: scale, then rotate by Rotation (via
// XMMatrixRotationZ) about RotationOrigin, then translate.
//
// Vector inputs are passed through XMVectorSelect with g_XMSelect1100, so only
// the lanes that mask selects are taken from the caller.
inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation2D
(
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    float     Rotation,
    FXMVECTOR Translation
) noexcept
{
    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    const XMVECTOR ScaleFactors = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
    const XMVECTOR RotateCentre = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
    const XMVECTOR Offset = XMVectorSelect(g_XMSelect1100.v, Translation, g_XMSelect1100.v);

    const XMMATRIX Rotate = XMMatrixRotationZ(Rotation);

    // Start from the scale matrix and fold the translations into row 3.
    XMMATRIX Result = XMMatrixScalingFromVector(ScaleFactors);
    Result.r[3] = XMVectorSubtract(Result.r[3], RotateCentre);
    Result = XMMatrixMultiply(Result, Rotate);
    Result.r[3] = XMVectorAdd(XMVectorAdd(Result.r[3], RotateCentre), Offset);
    return Result;
}

//------------------------------------------------------------------------------

// Builds a 3D affine transformation: scale, then rotate by RotationQuaternion
// about RotationOrigin, then translate.
//
// RotationOrigin and Translation are passed through XMVectorSelect with
// g_XMSelect1110 before being applied to row 3; Scaling is handed to
// XMMatrixScalingFromVector unmodified.
inline XMMATRIX XM_CALLCONV XMMatrixAffineTransformation
(
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    FXMVECTOR RotationQuaternion,
    GXMVECTOR Translation
) noexcept
{
    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    const XMVECTOR RotateCentre = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
    const XMVECTOR Offset = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);

    const XMMATRIX Rotate = XMMatrixRotationQuaternion(RotationQuaternion);

    // Start from the scale matrix and fold the translations into row 3.
    XMMATRIX Result = XMMatrixScalingFromVector(Scaling);
    Result.r[3] = XMVectorSubtract(Result.r[3], RotateCentre);
    Result = XMMatrixMultiply(Result, Rotate);
    Result.r[3] = XMVectorAdd(XMVectorAdd(Result.r[3], RotateCentre), Offset);
    return Result;
}

//------------------------------------------------------------------------------

// Builds a matrix that reflects through ReflectionPlane.
//
// The plane is normalized with XMPlaneNormalize first. Row i of the result is
// identity_row_i + P[i] * (-2 * <P.x, P.y, P.z, 0>), where P is the normalized
// plane. Builds with assertions enabled reject a plane whose XYZ part is zero
// or that XMPlaneIsInfinite reports as infinite.
inline XMMATRIX XM_CALLCONV XMMatrixReflect(FXMVECTOR ReflectionPlane) noexcept
{
    assert(!XMVector3Equal(ReflectionPlane, XMVectorZero()));
    assert(!XMPlaneIsInfinite(ReflectionPlane));

    // W is zero so the scaled vector never touches the fourth column.
    static const XMVECTORF32 NegativeTwo = { { { -2.0f, -2.0f, -2.0f, 0.0f } } };

    const XMVECTOR Plane = XMPlaneNormalize(ReflectionPlane);
    const XMVECTOR Scaled = XMVectorMultiply(Plane, NegativeTwo);

    XMMATRIX Result;
    Result.r[0] = XMVectorMultiplyAdd(XMVectorSplatX(Plane), Scaled, g_XMIdentityR0.v);
    Result.r[1] = XMVectorMultiplyAdd(XMVectorSplatY(Plane), Scaled, g_XMIdentityR1.v);
    Result.r[2] = XMVectorMultiplyAdd(XMVectorSplatZ(Plane), Scaled, g_XMIdentityR2.v);
    Result.r[3] = XMVectorMultiplyAdd(XMVectorSplatW(Plane), Scaled, g_XMIdentityR3.v);
    return Result;
}

//------------------------------------------------------------------------------

// Builds a matrix that flattens geometry onto ShadowPlane as lit from
// LightPosition.
//
// The plane is normalized with XMPlaneNormalize. Row i of the result is
// (-P[i]) * LightPosition plus dot(P, LightPosition) placed in lane i, where
// P is the normalized plane. Builds with assertions enabled reject a plane
// whose XYZ part is zero or that XMPlaneIsInfinite reports as infinite.
inline XMMATRIX XM_CALLCONV XMMatrixShadow
(
    FXMVECTOR ShadowPlane,
    FXMVECTOR LightPosition
) noexcept
{
    static const XMVECTORU32 Select0001 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1 } } };

    assert(!XMVector3Equal(ShadowPlane, XMVectorZero()));
    assert(!XMPlaneIsInfinite(ShadowPlane));

    const XMVECTOR Plane = XMPlaneNormalize(ShadowPlane);
    const XMVECTOR PlaneDotLight = XMPlaneDot(Plane, LightPosition);
    const XMVECTOR NegPlane = XMVectorNegate(Plane);

    // Keep the dot product in lane W only; each rotate-left by one element
    // below moves it one lane down (W -> Z -> Y -> X).
    const XMVECTOR DotInW = XMVectorSelect(Select0001.v, PlaneDotLight, Select0001.v);
    const XMVECTOR DotInZ = XMVectorRotateLeft(DotInW, 1);
    const XMVECTOR DotInY = XMVectorRotateLeft(DotInZ, 1);
    const XMVECTOR DotInX = XMVectorRotateLeft(DotInY, 1);

    XMMATRIX Result;
    Result.r[3] = XMVectorMultiplyAdd(XMVectorSplatW(NegPlane), LightPosition, DotInW);
    Result.r[2] = XMVectorMultiplyAdd(XMVectorSplatZ(NegPlane), LightPosition, DotInZ);
    Result.r[1] = XMVectorMultiplyAdd(XMVectorSplatY(NegPlane), LightPosition, DotInY);
    Result.r[0] = XMVectorMultiplyAdd(XMVectorSplatX(NegPlane), LightPosition, DotInX);
    return Result;
}

//------------------------------------------------------------------------------
// View and projection initialization operations
//------------------------------------------------------------------------------

// Builds a left-handed view matrix for a camera at EyePosition looking at
// FocusPosition. The look direction (FocusPosition - EyePosition) is computed
// here and the rest is delegated to XMMatrixLookToLH.
inline XMMATRIX XM_CALLCONV XMMatrixLookAtLH
(
    FXMVECTOR EyePosition,
    FXMVECTOR FocusPosition,
    FXMVECTOR UpDirection
) noexcept
{
    return XMMatrixLookToLH(EyePosition,
        XMVectorSubtract(FocusPosition, EyePosition),
        UpDirection);
}

//------------------------------------------------------------------------------

// Builds a right-handed view matrix for a camera at EyePosition looking at
// FocusPosition. The reversed look direction (EyePosition - FocusPosition) is
// fed to the left-handed builder XMMatrixLookToLH.
inline XMMATRIX XM_CALLCONV XMMatrixLookAtRH
(
    FXMVECTOR EyePosition,
    FXMVECTOR FocusPosition,
    FXMVECTOR UpDirection
) noexcept
{
    return XMMatrixLookToLH(EyePosition,
        XMVectorSubtract(EyePosition, FocusPosition),
        UpDirection);
}

//------------------------------------------------------------------------------

// Builds a left-handed view matrix for a camera at EyePosition looking along
// EyeDirection with UpDirection as the up hint.
//
// Builds with assertions enabled reject EyeDirection/UpDirection whose XYZ
// part is zero or infinite. Neither direction needs to be unit length: the
// look direction and the cross product with the up hint are normalized here.
inline XMMATRIX XM_CALLCONV XMMatrixLookToLH
(
    FXMVECTOR EyePosition,
    FXMVECTOR EyeDirection,
    FXMVECTOR UpDirection
) noexcept
{
    assert(!XMVector3Equal(EyeDirection, XMVectorZero()));
    assert(!XMVector3IsInfinite(EyeDirection));
    assert(!XMVector3Equal(UpDirection, XMVectorZero()));
    assert(!XMVector3IsInfinite(UpDirection));

    // Camera basis: Z along the look direction, X = normalize(Up x Z), Y = Z x X.
    const XMVECTOR AxisZ = XMVector3Normalize(EyeDirection);
    const XMVECTOR AxisX = XMVector3Normalize(XMVector3Cross(UpDirection, AxisZ));
    const XMVECTOR AxisY = XMVector3Cross(AxisZ, AxisX);

    // Projection of the negated eye position onto each basis vector.
    const XMVECTOR NegEye = XMVectorNegate(EyePosition);
    const XMVECTOR OffsetX = XMVector3Dot(AxisX, NegEye);
    const XMVECTOR OffsetY = XMVector3Dot(AxisY, NegEye);
    const XMVECTOR OffsetZ = XMVector3Dot(AxisZ, NegEye);

    // Assemble with each basis vector in a row and its offset in lane W, then
    // transpose so the basis vectors become columns and the offsets row 3.
    XMMATRIX Basis;
    Basis.r[0] = XMVectorSelect(OffsetX, AxisX, g_XMSelect1110.v);
    Basis.r[1] = XMVectorSelect(OffsetY, AxisY, g_XMSelect1110.v);
    Basis.r[2] = XMVectorSelect(OffsetZ, AxisZ, g_XMSelect1110.v);
    Basis.r[3] = g_XMIdentityR3.v;

    return XMMatrixTranspose(Basis);
}

//------------------------------------------------------------------------------

// Builds a right-handed view matrix for a camera at EyePosition looking along
// EyeDirection. The direction is negated and fed to the left-handed builder
// XMMatrixLookToLH.
inline XMMATRIX XM_CALLCONV XMMatrixLookToRH
(
    FXMVECTOR EyePosition,
    FXMVECTOR EyeDirection,
    FXMVECTOR UpDirection
) noexcept
{
    return XMMatrixLookToLH(EyePosition, XMVectorNegate(EyeDirection), UpDirection);
}

//------------------------------------------------------------------------------

#ifdef _PREFAST_
#pragma prefast(push)
#pragma prefast(disable:28931, "PREfast noise: Esp:1266")
#endif

// Builds a left-handed perspective projection matrix.
//
// With Q = FarZ / (FarZ - NearZ) the result is
//   [ 2*NearZ/ViewWidth   0                   0         0 ]
//   [ 0                   2*NearZ/ViewHeight  0         0 ]
//   [ 0                   0                   Q         1 ]
//   [ 0                   0                   -Q*NearZ  0 ]
// Builds with assertions enabled require NearZ and FarZ to be positive and
// reject ViewWidth, ViewHeight or (FarZ - NearZ) within 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveLH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (FarZ - NearZ);

    XMMATRIX Result;

    // Row 0: X scale
    Result.m[0][0] = DoubleNear / ViewWidth;
    Result.m[0][1] = 0.0f;
    Result.m[0][2] = 0.0f;
    Result.m[0][3] = 0.0f;

    // Row 1: Y scale
    Result.m[1][0] = 0.0f;
    Result.m[1][1] = DoubleNear / ViewHeight;
    Result.m[1][2] = 0.0f;
    Result.m[1][3] = 0.0f;

    // Row 2: depth scale, and +1 so output W receives input Z
    Result.m[2][0] = 0.0f;
    Result.m[2][1] = 0.0f;
    Result.m[2][2] = DepthScale;
    Result.m[2][3] = 1.0f;

    // Row 3: depth offset
    Result.m[3][0] = 0.0f;
    Result.m[3][1] = 0.0f;
    Result.m[3][2] = -DepthScale * NearZ;
    Result.m[3][3] = 0.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (FarZ - NearZ);
    const float32x4_t Zero = vdupq_n_f32(0);

    // Each row starts from a constant vector and has a single lane overwritten.
    XMMATRIX Result;
    Result.r[0] = vsetq_lane_f32(DoubleNear / ViewWidth, Zero, 0);
    Result.r[1] = vsetq_lane_f32(DoubleNear / ViewHeight, Zero, 1);
    Result.r[2] = vsetq_lane_f32(DepthScale, g_XMIdentityR3.v, 2);
    Result.r[3] = vsetq_lane_f32(-DepthScale * NearZ, Zero, 2);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (FarZ - NearZ);

    // The four non-trivial entries packed into one vector.
    XMVECTOR Packed = {
        DoubleNear / ViewWidth,
        DoubleNear / ViewHeight,
        DepthScale,
        -DepthScale * NearZ
    };
    const XMVECTOR Zero = _mm_setzero_ps();

    XMMATRIX Result;
    // <2*NearZ/ViewWidth, 0, 0, 0>
    Result.r[0] = _mm_move_ss(Zero, Packed);
    // <0, 2*NearZ/ViewHeight, 0, 0>
    Result.r[1] = _mm_and_ps(Packed, g_XMMaskY);
    // <Q, -Q*NearZ, 0, 1>
    const XMVECTOR DepthTerms = _mm_shuffle_ps(Packed, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // <0, 0, Q, 1>
    Result.r[2] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(3, 0, 0, 0));
    // <0, 0, -Q*NearZ, 0>
    Result.r[3] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(2, 1, 0, 0));
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a right-handed perspective projection matrix.
//
// With Q = FarZ / (NearZ - FarZ) the result is
//   [ 2*NearZ/ViewWidth   0                   0        0  ]
//   [ 0                   2*NearZ/ViewHeight  0        0  ]
//   [ 0                   0                   Q        -1 ]
//   [ 0                   0                   Q*NearZ  0  ]
// Builds with assertions enabled require NearZ and FarZ to be positive and
// reject ViewWidth, ViewHeight or (FarZ - NearZ) within 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveRH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (NearZ - FarZ);

    XMMATRIX Result;

    // Row 0: X scale
    Result.m[0][0] = DoubleNear / ViewWidth;
    Result.m[0][1] = 0.0f;
    Result.m[0][2] = 0.0f;
    Result.m[0][3] = 0.0f;

    // Row 1: Y scale
    Result.m[1][0] = 0.0f;
    Result.m[1][1] = DoubleNear / ViewHeight;
    Result.m[1][2] = 0.0f;
    Result.m[1][3] = 0.0f;

    // Row 2: depth scale, and -1 so output W receives negated input Z
    Result.m[2][0] = 0.0f;
    Result.m[2][1] = 0.0f;
    Result.m[2][2] = DepthScale;
    Result.m[2][3] = -1.0f;

    // Row 3: depth offset
    Result.m[3][0] = 0.0f;
    Result.m[3][1] = 0.0f;
    Result.m[3][2] = DepthScale * NearZ;
    Result.m[3][3] = 0.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (NearZ - FarZ);
    const float32x4_t Zero = vdupq_n_f32(0);

    // Each row starts from a constant vector and has a single lane overwritten.
    XMMATRIX Result;
    Result.r[0] = vsetq_lane_f32(DoubleNear / ViewWidth, Zero, 0);
    Result.r[1] = vsetq_lane_f32(DoubleNear / ViewHeight, Zero, 1);
    Result.r[2] = vsetq_lane_f32(DepthScale, g_XMNegIdentityR3.v, 2);
    Result.r[3] = vsetq_lane_f32(DepthScale * NearZ, Zero, 2);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float DepthScale = FarZ / (NearZ - FarZ);

    // The four non-trivial entries packed into one vector.
    XMVECTOR Packed = {
        DoubleNear / ViewWidth,
        DoubleNear / ViewHeight,
        DepthScale,
        DepthScale * NearZ
    };
    const XMVECTOR Zero = _mm_setzero_ps();

    XMMATRIX Result;
    // <2*NearZ/ViewWidth, 0, 0, 0>
    Result.r[0] = _mm_move_ss(Zero, Packed);
    // <0, 2*NearZ/ViewHeight, 0, 0>
    Result.r[1] = _mm_and_ps(Packed, g_XMMaskY);
    // <Q, Q*NearZ, 0, -1>
    const XMVECTOR DepthTerms = _mm_shuffle_ps(Packed, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // <0, 0, Q, -1>
    Result.r[2] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(3, 0, 0, 0));
    // <0, 0, Q*NearZ, 0>
    Result.r[3] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(2, 1, 0, 0));
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a left-handed perspective projection matrix from a vertical field of
// view.
//
// With H = cos(FovAngleY/2) / sin(FovAngleY/2) and Q = FarZ / (FarZ - NearZ)
// the result is
//   [ H/AspectRatio  0  0         0 ]
//   [ 0              H  0         0 ]
//   [ 0              0  Q         1 ]
//   [ 0              0  -Q*NearZ  0 ]
// Builds with assertions enabled require NearZ and FarZ to be positive and
// reject FovAngleY within 0.00002 of zero and AspectRatio or (FarZ - NearZ)
// within 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovLH
(
    float FovAngleY,
    float AspectRatio,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
    assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float ScaleY = CosHalfFov / SinHalfFov;
    const float ScaleX = ScaleY / AspectRatio;
    const float DepthScale = FarZ / (FarZ - NearZ);

    XMMATRIX Result;

    // Row 0: X scale
    Result.m[0][0] = ScaleX;
    Result.m[0][1] = 0.0f;
    Result.m[0][2] = 0.0f;
    Result.m[0][3] = 0.0f;

    // Row 1: Y scale
    Result.m[1][0] = 0.0f;
    Result.m[1][1] = ScaleY;
    Result.m[1][2] = 0.0f;
    Result.m[1][3] = 0.0f;

    // Row 2: depth scale, and +1 so output W receives input Z
    Result.m[2][0] = 0.0f;
    Result.m[2][1] = 0.0f;
    Result.m[2][2] = DepthScale;
    Result.m[2][3] = 1.0f;

    // Row 3: depth offset
    Result.m[3][0] = 0.0f;
    Result.m[3][1] = 0.0f;
    Result.m[3][2] = -DepthScale * NearZ;
    Result.m[3][3] = 0.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float DepthScale = FarZ / (FarZ - NearZ);
    const float ScaleY = CosHalfFov / SinHalfFov;
    const float ScaleX = ScaleY / AspectRatio;
    const float32x4_t Zero = vdupq_n_f32(0);

    // Each row starts from a constant vector and has a single lane overwritten.
    XMMATRIX Result;
    Result.r[0] = vsetq_lane_f32(ScaleX, Zero, 0);
    Result.r[1] = vsetq_lane_f32(ScaleY, Zero, 1);
    Result.r[2] = vsetq_lane_f32(DepthScale, g_XMIdentityR3.v, 2);
    Result.r[3] = vsetq_lane_f32(-DepthScale * NearZ, Zero, 2);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float DepthScale = FarZ / (FarZ - NearZ);
    const float ScaleY = CosHalfFov / SinHalfFov;

    // The four non-trivial entries packed into one vector.
    XMVECTOR Packed = {
        ScaleY / AspectRatio,
        ScaleY,
        DepthScale,
        -DepthScale * NearZ
    };
    const XMVECTOR Zero = _mm_setzero_ps();

    XMMATRIX Result;
    // <H/AspectRatio, 0, 0, 0>
    Result.r[0] = _mm_move_ss(Zero, Packed);
    // <0, H, 0, 0>
    Result.r[1] = _mm_and_ps(Packed, g_XMMaskY);
    // <Q, -Q*NearZ, 0, 1>
    const XMVECTOR DepthTerms = _mm_shuffle_ps(Packed, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // <0, 0, Q, 1>
    Result.r[2] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(3, 0, 0, 0));
    // <0, 0, -Q*NearZ, 0>
    Result.r[3] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(2, 1, 0, 0));
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a right-handed perspective projection matrix from a vertical field
// of view.
//
// With H = cos(FovAngleY/2) / sin(FovAngleY/2) and Q = FarZ / (NearZ - FarZ)
// the result is
//   [ H/AspectRatio  0  0        0  ]
//   [ 0              H  0        0  ]
//   [ 0              0  Q        -1 ]
//   [ 0              0  Q*NearZ  0  ]
// Builds with assertions enabled require NearZ and FarZ to be positive and
// reject FovAngleY within 0.00002 of zero and AspectRatio or (FarZ - NearZ)
// within 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveFovRH
(
    float FovAngleY,
    float AspectRatio,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f));
    assert(!XMScalarNearEqual(AspectRatio, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float ScaleY = CosHalfFov / SinHalfFov;
    const float ScaleX = ScaleY / AspectRatio;
    const float DepthScale = FarZ / (NearZ - FarZ);

    XMMATRIX Result;

    // Row 0: X scale
    Result.m[0][0] = ScaleX;
    Result.m[0][1] = 0.0f;
    Result.m[0][2] = 0.0f;
    Result.m[0][3] = 0.0f;

    // Row 1: Y scale
    Result.m[1][0] = 0.0f;
    Result.m[1][1] = ScaleY;
    Result.m[1][2] = 0.0f;
    Result.m[1][3] = 0.0f;

    // Row 2: depth scale, and -1 so output W receives negated input Z
    Result.m[2][0] = 0.0f;
    Result.m[2][1] = 0.0f;
    Result.m[2][2] = DepthScale;
    Result.m[2][3] = -1.0f;

    // Row 3: depth offset
    Result.m[3][0] = 0.0f;
    Result.m[3][1] = 0.0f;
    Result.m[3][2] = DepthScale * NearZ;
    Result.m[3][3] = 0.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float DepthScale = FarZ / (NearZ - FarZ);
    const float ScaleY = CosHalfFov / SinHalfFov;
    const float ScaleX = ScaleY / AspectRatio;
    const float32x4_t Zero = vdupq_n_f32(0);

    // Each row starts from a constant vector and has a single lane overwritten.
    XMMATRIX Result;
    Result.r[0] = vsetq_lane_f32(ScaleX, Zero, 0);
    Result.r[1] = vsetq_lane_f32(ScaleY, Zero, 1);
    Result.r[2] = vsetq_lane_f32(DepthScale, g_XMNegIdentityR3.v, 2);
    Result.r[3] = vsetq_lane_f32(DepthScale * NearZ, Zero, 2);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    float SinHalfFov;
    float CosHalfFov;
    XMScalarSinCos(&SinHalfFov, &CosHalfFov, 0.5f * FovAngleY);

    const float DepthScale = FarZ / (NearZ - FarZ);
    const float ScaleY = CosHalfFov / SinHalfFov;

    // The four non-trivial entries packed into one vector.
    XMVECTOR Packed = {
        ScaleY / AspectRatio,
        ScaleY,
        DepthScale,
        DepthScale * NearZ
    };
    const XMVECTOR Zero = _mm_setzero_ps();

    XMMATRIX Result;
    // <H/AspectRatio, 0, 0, 0>
    Result.r[0] = _mm_move_ss(Zero, Packed);
    // <0, H, 0, 0>
    Result.r[1] = _mm_and_ps(Packed, g_XMMaskY);
    // <Q, Q*NearZ, 0, -1>
    const XMVECTOR DepthTerms = _mm_shuffle_ps(Packed, g_XMNegIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // <0, 0, Q, -1>
    Result.r[2] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(3, 0, 0, 0));
    // <0, 0, Q*NearZ, 0>
    Result.r[3] = _mm_shuffle_ps(Zero, DepthTerms, _MM_SHUFFLE(2, 1, 0, 0));
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds a left-handed perspective projection matrix for an off-centre view
// rectangle.
//
// With rw = 1/(ViewRight - ViewLeft), rh = 1/(ViewTop - ViewBottom) and
// Q = FarZ / (FarZ - NearZ) the result is
//   [ 2*NearZ*rw                0                          0         0 ]
//   [ 0                         2*NearZ*rh                 0         0 ]
//   [ -(ViewLeft+ViewRight)*rw  -(ViewTop+ViewBottom)*rh   Q         1 ]
//   [ 0                         0                          -Q*NearZ  0 ]
// Builds with assertions enabled require NearZ and FarZ to be positive and
// reject a view rectangle or depth range whose extent is within 0.00001 of
// zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterLH
(
    float ViewLeft,
    float ViewRight,
    float ViewBottom,
    float ViewTop,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    const float DoubleNear = NearZ + NearZ;
    const float InvWidth = 1.0f / (ViewRight - ViewLeft);
    const float InvHeight = 1.0f / (ViewTop - ViewBottom);
    const float DepthScale = FarZ / (FarZ - NearZ);

    XMMATRIX Result;

    // Row 0: X scale
    Result.m[0][0] = DoubleNear * InvWidth;
    Result.m[0][1] = 0.0f;
    Result.m[0][2] = 0.0f;
    Result.m[0][3] = 0.0f;

    // Row 1: Y scale
    Result.m[1][0] = 0.0f;
    Result.m[1][1] = DoubleNear * InvHeight;
    Result.m[1][2] = 0.0f;
    Result.m[1][3] = 0.0f;

    // Row 2: off-centre X/Y terms, depth scale, and +1 for output W
    Result.m[2][0] = -(ViewLeft + ViewRight) * InvWidth;
    Result.m[2][1] = -(ViewTop + ViewBottom) * InvHeight;
    Result.m[2][2] = DepthScale;
    Result.m[2][3] = 1.0f;

    // Row 3: depth offset
    Result.m[3][0] = 0.0f;
    Result.m[3][1] = 0.0f;
    Result.m[3][2] = -DepthScale * NearZ;
    Result.m[3][3] = 0.0f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float InvWidth = 1.0f / (ViewRight - ViewLeft);
    const float InvHeight = 1.0f / (ViewTop - ViewBottom);
    const float DepthScale = FarZ / (FarZ - NearZ);
    const float32x4_t Zero = vdupq_n_f32(0);

    XMMATRIX Result;
    Result.r[0] = vsetq_lane_f32(DoubleNear * InvWidth, Zero, 0);
    Result.r[1] = vsetq_lane_f32(DoubleNear * InvHeight, Zero, 1);
    Result.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * InvWidth,
        -(ViewTop + ViewBottom) * InvHeight,
        DepthScale,
        1.0f);
    Result.r[3] = vsetq_lane_f32(-DepthScale * NearZ, Zero, 2);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    const float DoubleNear = NearZ + NearZ;
    const float InvWidth = 1.0f / (ViewRight - ViewLeft);
    const float InvHeight = 1.0f / (ViewTop - ViewBottom);
    const float DepthScale = FarZ / (FarZ - NearZ);

    // Entries of rows 0, 1 and 3, each already sitting in its final lane.
    XMVECTOR Packed = {
        DoubleNear * InvWidth,
        DoubleNear * InvHeight,
        -DepthScale * NearZ,
        0.0f
    };

    XMMATRIX Result;
    // <2*NearZ*rw, 0, 0, 0>
    Result.r[0] = _mm_move_ss(_mm_setzero_ps(), Packed);
    // <0, 2*NearZ*rh, 0, 0>
    Result.r[1] = _mm_and_ps(Packed, g_XMMaskY);
    // <-(ViewLeft+ViewRight)*rw, -(ViewTop+ViewBottom)*rh, Q, 1>
    Result.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * InvWidth,
        -(ViewTop + ViewBottom) * InvHeight,
        DepthScale,
        1.0f);
    // <0, 0, -Q*NearZ, 0>
    Result.r[3] = _mm_and_ps(Packed, g_XMMaskZ);
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Builds the off-center perspective projection matrix (RH variant).
//
// With ReciprocalWidth = 1 / (ViewRight - ViewLeft),
// ReciprocalHeight = 1 / (ViewTop - ViewBottom), TwoNearZ = NearZ + NearZ and
// fRange = FarZ / (NearZ - FarZ), every code path below produces:
//   row 0: [ TwoNearZ * ReciprocalWidth, 0, 0, 0 ]
//   row 1: [ 0, TwoNearZ * ReciprocalHeight, 0, 0 ]
//   row 2: [ (ViewLeft + ViewRight) * ReciprocalWidth,
//            (ViewTop + ViewBottom) * ReciprocalHeight, fRange, -1 ]
//   row 3: [ 0, 0, fRange * NearZ, 0 ]
//
// Asserted preconditions: NearZ and FarZ are positive, and none of the
// differences used as divisors (ViewRight - ViewLeft, ViewTop - ViewBottom,
// FarZ - NearZ) is within 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixPerspectiveOffCenterRH
(
    float ViewLeft,
    float ViewRight,
    float ViewBottom,
    float ViewTop,
    float NearZ,
    float FarZ
) noexcept
{
    assert(NearZ > 0.f && FarZ > 0.f);
    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    // Scalar reference path: fill the sixteen elements one by one
    float TwoNearZ = NearZ + NearZ;
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = FarZ / (NearZ - FarZ);

    XMMATRIX M;
    M.m[0][0] = TwoNearZ * ReciprocalWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = TwoNearZ * ReciprocalHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth;
    M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight;
    M.m[2][2] = fRange;
    M.m[2][3] = -1.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = fRange * NearZ;
    M.m[3][3] = 0.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float TwoNearZ = NearZ + NearZ;
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = FarZ / (NearZ - FarZ);
    const float32x4_t Zero = vdupq_n_f32(0);

    // Rows 0, 1 and 3 have a single non-zero lane: insert it into a zero vector
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(TwoNearZ * ReciprocalWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(TwoNearZ * ReciprocalHeight, Zero, 1);
    M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth,
        (ViewTop + ViewBottom) * ReciprocalHeight,
        fRange,
        -1.0f);
    M.r[3] = vsetq_lane_f32(fRange * NearZ, Zero, 2);
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float TwoNearZ = NearZ + NearZ;
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = FarZ / (NearZ - FarZ);
    // Note: This is recorded on the stack
    // Lanes: x feeds row 0, y feeds row 1, z feeds row 3, w is zero
    XMVECTOR rMem = {
        TwoNearZ * ReciprocalWidth,
        TwoNearZ * ReciprocalHeight,
        fRange * NearZ,
        0
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);
    // TwoNearZ*ReciprocalWidth,0,0,0
    M.r[0] = vTemp;
    // 0,TwoNearZ*ReciprocalHeight,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
    M.r[1] = vTemp;
    // (ViewLeft + ViewRight)*ReciprocalWidth,(ViewTop + ViewBottom)*ReciprocalHeight,fRange,-1.0f
    M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth,
        (ViewTop + ViewBottom) * ReciprocalHeight,
        fRange,
        -1.0f);
    // 0,0,fRange * NearZ,0.0f
    vValues = _mm_and_ps(vValues, g_XMMaskZ);
    M.r[3] = vValues;
    return M;
#endif
}

//------------------------------------------------------------------------------

// Builds the centered orthographic projection matrix (LH variant).
//
// With fRange = 1 / (FarZ - NearZ), every code path below produces:
//   row 0: [ 2 / ViewWidth, 0, 0, 0 ]
//   row 1: [ 0, 2 / ViewHeight, 0, 0 ]
//   row 2: [ 0, 0, fRange, 0 ]
//   row 3: [ 0, 0, -fRange * NearZ, 1 ]
//
// Asserted preconditions: ViewWidth, ViewHeight and (FarZ - NearZ) are not
// within 0.00001 of zero, since each is used as a divisor.
inline XMMATRIX XM_CALLCONV XMMatrixOrthographicLH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
) noexcept
{
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    // Scalar reference path: fill the sixteen elements one by one
    float fRange = 1.0f / (FarZ - NearZ);

    XMMATRIX M;
    M.m[0][0] = 2.0f / ViewWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = 2.0f / ViewHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = -fRange * NearZ;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fRange = 1.0f / (FarZ - NearZ);

    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
    // Row 3 starts from g_XMIdentityR3 and only lane 2 is overwritten
    // (the other lanes are expected to be 0,0,_,1 to match the scalar path)
    M.r[3] = vsetq_lane_f32(-fRange * NearZ, g_XMIdentityR3.v, 2);
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float fRange = 1.0f / (FarZ - NearZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        2.0f / ViewWidth,
        2.0f / ViewHeight,
        fRange,
        -fRange * NearZ
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);
    // 2.0f / ViewWidth,0,0,0
    M.r[0] = vTemp;
    // 0,2.0f / ViewHeight,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
    M.r[1] = vTemp;
    // x=fRange,y=-fRange * NearZ,0,1.0f
    vTemp = _mm_setzero_ps();
    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // 0,0,fRange,0.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0));
    M.r[2] = vTemp;
    // 0,0,-fRange * NearZ,1.0f
    // (the low two lanes come from vTemp, whose x lane is still zero after the previous shuffle)
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0));
    M.r[3] = vTemp;
    return M;
#endif
}

//------------------------------------------------------------------------------

// Builds the centered orthographic projection matrix (RH variant).
//
// With fRange = 1 / (NearZ - FarZ), every code path below produces:
//   row 0: [ 2 / ViewWidth, 0, 0, 0 ]
//   row 1: [ 0, 2 / ViewHeight, 0, 0 ]
//   row 2: [ 0, 0, fRange, 0 ]
//   row 3: [ 0, 0, fRange * NearZ, 1 ]
//
// Asserted preconditions: ViewWidth, ViewHeight and (FarZ - NearZ) are not
// within 0.00001 of zero, since each is used as a divisor.
inline XMMATRIX XM_CALLCONV XMMatrixOrthographicRH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
) noexcept
{
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    // Scalar reference path: fill the sixteen elements one by one
    float fRange = 1.0f / (NearZ - FarZ);

    XMMATRIX M;
    M.m[0][0] = 2.0f / ViewWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = 2.0f / ViewHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = fRange * NearZ;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fRange = 1.0f / (NearZ - FarZ);

    const float32x4_t Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(2.0f / ViewWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(2.0f / ViewHeight, Zero, 1);
    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
    // Row 3 starts from g_XMIdentityR3 and only lane 2 is overwritten
    // (the other lanes are expected to be 0,0,_,1 to match the scalar path)
    M.r[3] = vsetq_lane_f32(fRange * NearZ, g_XMIdentityR3.v, 2);
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float fRange = 1.0f / (NearZ - FarZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        2.0f / ViewWidth,
        2.0f / ViewHeight,
        fRange,
        fRange * NearZ
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);
    // 2.0f / ViewWidth,0,0,0
    M.r[0] = vTemp;
    // 0,2.0f / ViewHeight,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
    M.r[1] = vTemp;
    // x=fRange,y=fRange * NearZ,0,1.0f
    vTemp = _mm_setzero_ps();
    vValues = _mm_shuffle_ps(vValues, g_XMIdentityR3, _MM_SHUFFLE(3, 2, 3, 2));
    // 0,0,fRange,0.0f
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 0, 0, 0));
    M.r[2] = vTemp;
    // 0,0,fRange * NearZ,1.0f
    // (the low two lanes come from vTemp, whose x lane is still zero after the previous shuffle)
    vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 1, 0, 0));
    M.r[3] = vTemp;
    return M;
#endif
}

//------------------------------------------------------------------------------

// Builds the off-center orthographic projection matrix (LH variant).
//
// With ReciprocalWidth = 1 / (ViewRight - ViewLeft),
// ReciprocalHeight = 1 / (ViewTop - ViewBottom) and
// fRange = 1 / (FarZ - NearZ), every code path below produces:
//   row 0: [ 2 * ReciprocalWidth, 0, 0, 0 ]
//   row 1: [ 0, 2 * ReciprocalHeight, 0, 0 ]
//   row 2: [ 0, 0, fRange, 0 ]
//   row 3: [ -(ViewLeft + ViewRight) * ReciprocalWidth,
//            -(ViewTop + ViewBottom) * ReciprocalHeight,
//            -fRange * NearZ, 1 ]
//
// Asserted preconditions: none of the differences used as divisors
// (ViewRight - ViewLeft, ViewTop - ViewBottom, FarZ - NearZ) is within
// 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterLH
(
    float ViewLeft,
    float ViewRight,
    float ViewBottom,
    float ViewTop,
    float NearZ,
    float FarZ
) noexcept
{
    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    // Scalar reference path: fill the sixteen elements one by one
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (FarZ - NearZ);

    XMMATRIX M;
    M.m[0][0] = ReciprocalWidth + ReciprocalWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = ReciprocalHeight + ReciprocalHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = 0.0f;

    M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
    M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
    M.m[3][2] = -fRange * NearZ;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (FarZ - NearZ);
    const float32x4_t Zero = vdupq_n_f32(0);
    // Rows 0-2 have a single non-zero lane: insert it into a zero vector
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
    M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
        -(ViewTop + ViewBottom) * ReciprocalHeight,
        -fRange * NearZ,
        1.0f);
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (FarZ - NearZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        fReciprocalWidth,
        fReciprocalHeight,
        fRange,
        1.0f
    };
    // Per-lane multipliers that turn rMem into row 3
    XMVECTOR rMem2 = {
        -(ViewLeft + ViewRight),
        -(ViewTop + ViewBottom),
        -NearZ,
        1.0f
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);
    // fReciprocalWidth*2,0,0,0
    vTemp = _mm_add_ss(vTemp, vTemp);
    M.r[0] = vTemp;
    // 0,fReciprocalHeight*2,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
    vTemp = _mm_add_ps(vTemp, vTemp);
    M.r[1] = vTemp;
    // 0,0,fRange,0.0f
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskZ);
    M.r[2] = vTemp;
    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f
    vValues = _mm_mul_ps(vValues, rMem2);
    M.r[3] = vValues;
    return M;
#endif
}

//------------------------------------------------------------------------------

// Builds the off-center orthographic projection matrix (RH variant).
//
// With ReciprocalWidth = 1 / (ViewRight - ViewLeft),
// ReciprocalHeight = 1 / (ViewTop - ViewBottom) and
// fRange = 1 / (NearZ - FarZ), every code path below produces:
//   row 0: [ 2 * ReciprocalWidth, 0, 0, 0 ]
//   row 1: [ 0, 2 * ReciprocalHeight, 0, 0 ]
//   row 2: [ 0, 0, fRange, 0 ]
//   row 3: [ -(ViewLeft + ViewRight) * ReciprocalWidth,
//            -(ViewTop + ViewBottom) * ReciprocalHeight,
//            fRange * NearZ, 1 ]
//
// Asserted preconditions: none of the differences used as divisors
// (ViewRight - ViewLeft, ViewTop - ViewBottom, FarZ - NearZ) is within
// 0.00001 of zero.
inline XMMATRIX XM_CALLCONV XMMatrixOrthographicOffCenterRH
(
    float ViewLeft,
    float ViewRight,
    float ViewBottom,
    float ViewTop,
    float NearZ,
    float FarZ
) noexcept
{
    assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f));
    assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    // Scalar reference path: fill the sixteen elements one by one
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (NearZ - FarZ);

    XMMATRIX M;
    M.m[0][0] = ReciprocalWidth + ReciprocalWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = ReciprocalHeight + ReciprocalHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = 0.0f;

    // Row 3 is written element-wise like the other rows of this scalar path
    M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth;
    M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight;
    M.m[3][2] = fRange * NearZ;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (NearZ - FarZ);
    const float32x4_t Zero = vdupq_n_f32(0);
    // Rows 0-2 have a single non-zero lane: insert it into a zero vector
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32(ReciprocalWidth + ReciprocalWidth, Zero, 0);
    M.r[1] = vsetq_lane_f32(ReciprocalHeight + ReciprocalHeight, Zero, 1);
    M.r[2] = vsetq_lane_f32(fRange, Zero, 2);
    M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth,
        -(ViewTop + ViewBottom) * ReciprocalHeight,
        fRange * NearZ,
        1.0f);
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft);
    float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom);
    float fRange = 1.0f / (NearZ - FarZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        fReciprocalWidth,
        fReciprocalHeight,
        fRange,
        1.0f
    };
    // Per-lane multipliers that turn rMem into row 3
    XMVECTOR rMem2 = {
        -(ViewLeft + ViewRight),
        -(ViewTop + ViewBottom),
        NearZ,
        1.0f
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp, vValues);
    // fReciprocalWidth*2,0,0,0
    vTemp = _mm_add_ss(vTemp, vTemp);
    M.r[0] = vTemp;
    // 0,fReciprocalHeight*2,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskY);
    vTemp = _mm_add_ps(vTemp, vTemp);
    M.r[1] = vTemp;
    // 0,0,fRange,0.0f
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp, g_XMMaskZ);
    M.r[2] = vTemp;
    // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*NearZ,1.0f
    vValues = _mm_mul_ps(vValues, rMem2);
    M.r[3] = vValues;
    return M;
#endif
}

#ifdef _PREFAST_
#pragma prefast(pop)
#endif

/****************************************************************************
 *
 * XMMATRIX operators and methods
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------

// Constructs a matrix from sixteen scalars given row by row
// (mRC is the element at row R, column C).
inline XMMATRIX::XMMATRIX
(
    float m00, float m01, float m02, float m03,
    float m10, float m11, float m12, float m13,
    float m20, float m21, float m22, float m23,
    float m30, float m31, float m32, float m33
) noexcept
{
    // Pack each group of four scalars into one row vector
    const XMVECTOR Row0 = XMVectorSet(m00, m01, m02, m03);
    const XMVECTOR Row1 = XMVectorSet(m10, m11, m12, m13);
    const XMVECTOR Row2 = XMVectorSet(m20, m21, m22, m23);
    const XMVECTOR Row3 = XMVectorSet(m30, m31, m32, m33);

    r[0] = Row0;
    r[1] = Row1;
    r[2] = Row2;
    r[3] = Row3;
}

//------------------------------------------------------------------------------

// Constructs a matrix from sixteen consecutive floats: elements
// [4*Row, 4*Row + 3] of pArray become row Row. pArray must not be null.
inline XMMATRIX::XMMATRIX(const float* pArray) noexcept
{
    assert(pArray != nullptr);
    for (size_t Row = 0; Row < 4; ++Row)
    {
        // Each row is loaded as one XMFLOAT4 starting four floats after the previous one
        const XMFLOAT4* pRow = reinterpret_cast<const XMFLOAT4*>(pArray + Row * 4);
        r[Row] = XMLoadFloat4(pRow);
    }
}

//------------------------------------------------------------------------------

// Unary minus: returns a matrix whose rows are XMVectorNegate of this matrix's rows.
inline XMMATRIX XMMATRIX::operator- () const noexcept
{
    XMMATRIX Negated;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Negated.r[Row] = XMVectorNegate(r[Row]);
    }
    return Negated;
}

//------------------------------------------------------------------------------

// Adds M to this matrix row by row, in place, and returns *this.
inline XMMATRIX& XM_CALLCONV XMMATRIX::operator+= (FXMMATRIX M) noexcept
{
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = XMVectorAdd(r[Row], M.r[Row]);
    }
    return *this;
}

//------------------------------------------------------------------------------

// Subtracts M from this matrix row by row, in place, and returns *this.
inline XMMATRIX& XM_CALLCONV XMMATRIX::operator-= (FXMMATRIX M) noexcept
{
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = XMVectorSubtract(r[Row], M.r[Row]);
    }
    return *this;
}

//------------------------------------------------------------------------------

// Replaces this matrix with XMMatrixMultiply(*this, M) and returns *this.
inline XMMATRIX& XM_CALLCONV XMMATRIX::operator*=(FXMMATRIX M) noexcept
{
    // Compute the full product first, then store it back into this matrix
    const XMMATRIX Product = XMMatrixMultiply(*this, M);
    *this = Product;
    return *this;
}

//------------------------------------------------------------------------------

// Scales every row of this matrix by S in place and returns *this.
inline XMMATRIX& XMMATRIX::operator*= (float S) noexcept
{
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = XMVectorScale(r[Row], S);
    }
    return *this;
}

//------------------------------------------------------------------------------

// Divides every row of this matrix by the scalar S in place and returns *this.
// No check for S == 0 is made here.
inline XMMATRIX& XMMATRIX::operator/= (float S) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTOR Divisor = XMVectorReplicate(S);
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = XMVectorDivide(r[Row], Divisor);
    }
    return *this;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // This branch uses the vector divide intrinsic directly
    const float32x4_t Divisor = vdupq_n_f32(S);
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = vdivq_f32(r[Row], Divisor);
    }
#else
    // Start from the reciprocal estimate of S and refine it with
    // two Newton-Raphson iterations, then multiply instead of dividing
    const float32x2_t Divisor = vdup_n_f32(S);
    float32x2_t Estimate = vrecpe_f32(Divisor);
    float32x2_t Correction = vrecps_f32(Estimate, Divisor);
    Estimate = vmul_f32(Correction, Estimate);
    Correction = vrecps_f32(Estimate, Divisor);
    Estimate = vmul_f32(Correction, Estimate);
    const float32x4_t Reciprocal = vcombine_f32(Estimate, Estimate);
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = vmulq_f32(r[Row], Reciprocal);
    }
#endif
    return *this;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128 Divisor = _mm_set_ps1(S);
    for (size_t Row = 0; Row < 4; ++Row)
    {
        r[Row] = _mm_div_ps(r[Row], Divisor);
    }
    return *this;
#endif
}

//------------------------------------------------------------------------------

// Returns the row-by-row sum of this matrix and M.
inline XMMATRIX XM_CALLCONV XMMATRIX::operator+ (FXMMATRIX M) const noexcept
{
    XMMATRIX Sum;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Sum.r[Row] = XMVectorAdd(r[Row], M.r[Row]);
    }
    return Sum;
}

//------------------------------------------------------------------------------

// Returns the row-by-row difference of this matrix and M (this - M).
inline XMMATRIX XM_CALLCONV XMMATRIX::operator- (FXMMATRIX M) const noexcept
{
    XMMATRIX Difference;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Difference.r[Row] = XMVectorSubtract(r[Row], M.r[Row]);
    }
    return Difference;
}

//------------------------------------------------------------------------------

// Returns XMMatrixMultiply(*this, M); this matrix is the left operand.
inline XMMATRIX XM_CALLCONV XMMATRIX::operator*(FXMMATRIX M) const noexcept
{
    const XMMATRIX Product = XMMatrixMultiply(*this, M);
    return Product;
}

//------------------------------------------------------------------------------

// Returns a copy of this matrix with every row scaled by S.
inline XMMATRIX XMMATRIX::operator* (float S) const noexcept
{
    XMMATRIX Scaled;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Scaled.r[Row] = XMVectorScale(r[Row], S);
    }
    return Scaled;
}

//------------------------------------------------------------------------------

// Returns a copy of this matrix with every row divided by the scalar S.
// No check for S == 0 is made here.
inline XMMATRIX XMMATRIX::operator/ (float S) const noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const XMVECTOR Divisor = XMVectorReplicate(S);
    XMMATRIX Quotient;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Quotient.r[Row] = XMVectorDivide(r[Row], Divisor);
    }
    return Quotient;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
    // This branch uses the vector divide intrinsic directly
    const float32x4_t Divisor = vdupq_n_f32(S);
    XMMATRIX Quotient;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Quotient.r[Row] = vdivq_f32(r[Row], Divisor);
    }
#else
    // Start from the reciprocal estimate of S and refine it with
    // two Newton-Raphson iterations, then multiply instead of dividing
    const float32x2_t Divisor = vdup_n_f32(S);
    float32x2_t Estimate = vrecpe_f32(Divisor);
    float32x2_t Correction = vrecps_f32(Estimate, Divisor);
    Estimate = vmul_f32(Correction, Estimate);
    Correction = vrecps_f32(Estimate, Divisor);
    Estimate = vmul_f32(Correction, Estimate);
    const float32x4_t Reciprocal = vcombine_f32(Estimate, Estimate);
    XMMATRIX Quotient;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Quotient.r[Row] = vmulq_f32(r[Row], Reciprocal);
    }
#endif
    return Quotient;
#elif defined(_XM_SSE_INTRINSICS_)
    const __m128 Divisor = _mm_set_ps1(S);
    XMMATRIX Quotient;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Quotient.r[Row] = _mm_div_ps(r[Row], Divisor);
    }
    return Quotient;
#endif
}

//------------------------------------------------------------------------------

// Scalar-on-the-left multiply: returns M with every row scaled by S.
inline XMMATRIX XM_CALLCONV operator*
(
    float S,
    FXMMATRIX M
) noexcept
{
    XMMATRIX Scaled;
    for (size_t Row = 0; Row < 4; ++Row)
    {
        Scaled.r[Row] = XMVectorScale(M.r[Row], S);
    }
    return Scaled;
}

/****************************************************************************
 *
 * XMFLOAT3X3 operators
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------

// Constructs a 3x3 matrix from nine consecutive floats in row-major order:
// m[Row][Column] = pArray[Row * 3 + Column]. pArray must not be null.
inline XMFLOAT3X3::XMFLOAT3X3(const float* pArray) noexcept
{
    assert(pArray != nullptr);

    m[0][0] = pArray[0];
    m[0][1] = pArray[1];
    m[0][2] = pArray[2];

    m[1][0] = pArray[3];
    m[1][1] = pArray[4];
    m[1][2] = pArray[5];

    m[2][0] = pArray[6];
    m[2][1] = pArray[7];
    m[2][2] = pArray[8];
}

/****************************************************************************
 *
 * XMFLOAT4X3 operators
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------

// Constructs a 4x3 matrix (four rows of three) from twelve consecutive floats
// in row-major order: m[Row][Column] = pArray[Row * 3 + Column].
// pArray must not be null.
inline XMFLOAT4X3::XMFLOAT4X3(const float* pArray) noexcept
{
    assert(pArray != nullptr);

    for (size_t Row = 0; Row < 4; ++Row)
    {
        for (size_t Column = 0; Column < 3; ++Column)
        {
            m[Row][Column] = pArray[Row * 3 + Column];
        }
    }
}

/****************************************************************************
*
* XMFLOAT3X4 operators
*
****************************************************************************/

//------------------------------------------------------------------------------

// Constructs a 3x4 matrix (three rows of four) from twelve consecutive floats
// in row-major order: m[Row][Column] = pArray[Row * 4 + Column].
// pArray must not be null.
inline XMFLOAT3X4::XMFLOAT3X4(const float* pArray) noexcept
{
    assert(pArray != nullptr);

    for (size_t Row = 0; Row < 3; ++Row)
    {
        for (size_t Column = 0; Column < 4; ++Column)
        {
            m[Row][Column] = pArray[Row * 4 + Column];
        }
    }
}

/****************************************************************************
 *
 * XMFLOAT4X4 operators
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------

// Constructs a 4x4 matrix from sixteen consecutive floats in row-major order:
// m[Row][Column] = pArray[Row * 4 + Column]. pArray must not be null.
inline XMFLOAT4X4::XMFLOAT4X4(const float* pArray) noexcept
{
    assert(pArray != nullptr);

    for (size_t Row = 0; Row < 4; ++Row)
    {
        for (size_t Column = 0; Column < 4; ++Column)
        {
            m[Row][Column] = pArray[Row * 4 + Column];
        }
    }
}

// Quaternion equality test; forwards to XMVector4Equal on the two operands.
inline bool XM_CALLCONV XMQuaternionEqual
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
) noexcept
{
    const bool bEqual = XMVector4Equal(Q1, Q2);
    return bEqual;
}

//------------------------------------------------------------------------------

// Quaternion inequality test; forwards to XMVector4NotEqual on the two operands.
inline bool XM_CALLCONV XMQuaternionNotEqual
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
) noexcept
{
    const bool bDiffers = XMVector4NotEqual(Q1, Q2);
    return bDiffers;
}

//------------------------------------------------------------------------------

// NaN test for a quaternion; forwards to XMVector4IsNaN.
inline bool XM_CALLCONV XMQuaternionIsNaN(FXMVECTOR Q) noexcept
{
    const bool bHasNaN = XMVector4IsNaN(Q);
    return bHasNaN;
}

//------------------------------------------------------------------------------

// Infinity test for a quaternion; forwards to XMVector4IsInfinite.
inline bool XM_CALLCONV XMQuaternionIsInfinite(FXMVECTOR Q) noexcept
{
    const bool bHasInfinity = XMVector4IsInfinite(Q);
    return bHasInfinity;
}

//------------------------------------------------------------------------------

// Returns the result of XMVector4Equal between Q and the constant g_XMIdentityR3.
inline bool XM_CALLCONV XMQuaternionIsIdentity(FXMVECTOR Q) noexcept
{
    const XMVECTOR Identity = g_XMIdentityR3.v;
    return XMVector4Equal(Q, Identity);
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Quaternion dot product; forwards to XMVector4Dot and returns its vector result.
inline XMVECTOR XM_CALLCONV XMQuaternionDot
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
) noexcept
{
    const XMVECTOR Dot = XMVector4Dot(Q1, Q2);
    return Dot;
}

//------------------------------------------------------------------------------

// Multiplies two quaternions stored as (x, y, z, w) in lanes 0..3.
//
// Q1: first operand (the rotation applied first, per the note below).
// Q2: second operand (the rotation applied second).
// Returns the four-lane product given by the formula in the body; all three
// code paths (scalar, NEON, SSE) evaluate that same formula.
inline XMVECTOR XM_CALLCONV XMQuaternionMultiply
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
) noexcept
{
    // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2)

    // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
    //   (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
    //   (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
    //   (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path: the formula above written out per lane
    XMVECTORF32 Result = { { {
            (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]),
            (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]),
            (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]),
            (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane signs applied to the Q2.x, Q2.y and Q2.z terms respectively
    static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } };
    static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } };
    static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } };

    float32x2_t Q2L = vget_low_f32(Q2);
    float32x2_t Q2H = vget_high_f32(Q2);

    // Broadcast Q2.x, Q2.y and Q2.z; the Q2.w term is folded into vResult directly
    float32x4_t Q2X = vdupq_lane_f32(Q2L, 0);
    float32x4_t Q2Y = vdupq_lane_f32(Q2L, 1);
    float32x4_t Q2Z = vdupq_lane_f32(Q2H, 0);
    XMVECTOR vResult = vmulq_lane_f32(Q1, Q2H, 1);

    // Mul by Q1WZYX
    float32x4_t vTemp = vrev64q_f32(Q1);
    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
    Q2X = vmulq_f32(Q2X, vTemp);
    vResult = vmlaq_f32(vResult, Q2X, ControlWZYX);

    // Mul by Q1ZWXY
    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
    Q2Y = vmulq_f32(Q2Y, vTemp);
    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);

    // Mul by Q1YXWZ
    vTemp = vreinterpretq_f32_u32(vrev64q_u32(vreinterpretq_u32_f32(vTemp)));
    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
    Q2Z = vmulq_f32(Q2Z, vTemp);
    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-lane signs applied to the Q2.x, Q2.y and Q2.z terms respectively
    static const XMVECTORF32 ControlWZYX = { { { 1.0f, -1.0f, 1.0f, -1.0f } } };
    static const XMVECTORF32 ControlZWXY = { { { 1.0f, 1.0f, -1.0f, -1.0f } } };
    static const XMVECTORF32 ControlYXWZ = { { { -1.0f, 1.0f, 1.0f, -1.0f } } };
    // Copy to SSE registers and use as few as possible for x86
    XMVECTOR Q2X = Q2;
    XMVECTOR Q2Y = Q2;
    XMVECTOR Q2Z = Q2;
    XMVECTOR vResult = Q2;
    // Splat with one instruction
    vResult = XM_PERMUTE_PS(vResult, _MM_SHUFFLE(3, 3, 3, 3));
    Q2X = XM_PERMUTE_PS(Q2X, _MM_SHUFFLE(0, 0, 0, 0));
    Q2Y = XM_PERMUTE_PS(Q2Y, _MM_SHUFFLE(1, 1, 1, 1));
    Q2Z = XM_PERMUTE_PS(Q2Z, _MM_SHUFFLE(2, 2, 2, 2));
    // Retire Q1 and perform Q1*Q2W
    vResult = _mm_mul_ps(vResult, Q1);
    XMVECTOR Q1Shuffle = Q1;
    // Shuffle the copies of Q1
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
    // Mul by Q1WZYX
    Q2X = _mm_mul_ps(Q2X, Q1Shuffle);
    // Each permute below is applied to the already-permuted Q1Shuffle,
    // so the order of these statements matters
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(2, 3, 0, 1));
    // Flip the signs on y and w
    vResult = XM_FMADD_PS(Q2X, ControlWZYX, vResult);
    // Mul by Q1ZWXY
    Q2Y = _mm_mul_ps(Q2Y, Q1Shuffle);
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle, _MM_SHUFFLE(0, 1, 2, 3));
    // Flip the signs on z and w
    Q2Y = _mm_mul_ps(Q2Y, ControlZWXY);
    // Mul by Q1YXWZ
    Q2Z = _mm_mul_ps(Q2Z, Q1Shuffle);
    // Flip the signs on x and w
    Q2Y = XM_FMADD_PS(Q2Z, ControlYXWZ, Q2Y);
    vResult = _mm_add_ps(vResult, Q2Y);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Squared length of a quaternion; forwards to XMVector4LengthSq.
inline XMVECTOR XM_CALLCONV XMQuaternionLengthSq(FXMVECTOR Q) noexcept
{
    const XMVECTOR LengthSq = XMVector4LengthSq(Q);
    return LengthSq;
}

//------------------------------------------------------------------------------

// Reciprocal length of a quaternion; forwards to XMVector4ReciprocalLength.
inline XMVECTOR XM_CALLCONV XMQuaternionReciprocalLength(FXMVECTOR Q) noexcept
{
    const XMVECTOR ReciprocalLength = XMVector4ReciprocalLength(Q);
    return ReciprocalLength;
}

//------------------------------------------------------------------------------

// Length of a quaternion; forwards to XMVector4Length.
inline XMVECTOR XM_CALLCONV XMQuaternionLength(FXMVECTOR Q) noexcept
{
    const XMVECTOR Length = XMVector4Length(Q);
    return Length;
}

//------------------------------------------------------------------------------

// Estimated normalization of a quaternion; forwards to XMVector4NormalizeEst.
inline XMVECTOR XM_CALLCONV XMQuaternionNormalizeEst(FXMVECTOR Q) noexcept
{
    const XMVECTOR Normalized = XMVector4NormalizeEst(Q);
    return Normalized;
}

//------------------------------------------------------------------------------

// Normalization of a quaternion; forwards to XMVector4Normalize.
inline XMVECTOR XM_CALLCONV XMQuaternionNormalize(FXMVECTOR Q) noexcept
{
    const XMVECTOR Normalized = XMVector4Normalize(Q);
    return Normalized;
}

//------------------------------------------------------------------------------

// Returns the conjugate of Q: lanes x, y and z are negated, lane w is kept.
inline XMVECTOR XM_CALLCONV XMQuaternionConjugate(FXMVECTOR Q) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Conjugated = { { {
            -Q.vector4_f32[0],
            -Q.vector4_f32[1],
            -Q.vector4_f32[2],
            Q.vector4_f32[3]
        } } };
    return Conjugated.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Multiplying by (-1, -1, -1, 1) flips the sign of x, y and z only
    static const XMVECTORF32 SignFlipXYZ = { { { -1.0f, -1.0f, -1.0f, 1.0f } } };
    const XMVECTOR Conjugated = vmulq_f32(Q, SignFlipXYZ.v);
    return Conjugated;
#elif defined(_XM_SSE_INTRINSICS_)
    // Multiplying by (-1, -1, -1, 1) flips the sign of x, y and z only
    static const XMVECTORF32 SignFlipXYZ = { { { -1.0f, -1.0f, -1.0f, 1.0f } } };
    const XMVECTOR Conjugated = _mm_mul_ps(Q, SignFlipXYZ);
    return Conjugated;
#endif
}

//------------------------------------------------------------------------------

// Returns conjugate(Q) divided by the squared length of Q. Where the squared
// length is less than or equal to g_XMEpsilon, g_XMZero is selected instead of
// the quotient.
inline XMVECTOR XM_CALLCONV XMQuaternionInverse(FXMVECTOR Q) noexcept
{
    const XMVECTOR LengthSq = XMVector4LengthSq(Q);

    // Mask marking a squared length too small to divide by
    const XMVECTOR TooSmall = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);

    // The division is always performed; the mask decides whether its result is used
    const XMVECTOR Quotient = XMVectorDivide(XMQuaternionConjugate(Q), LengthSq);

    return XMVectorSelect(Quotient, g_XMZero, TooSmall);
}

//------------------------------------------------------------------------------

// Quaternion natural logarithm.
//
// Let V be Q filtered through g_XMSelect1110 (presumably x, y, z from Q with
// w cleared - confirm against the constant's definition) and
// Theta = acos(Q.w). The result is V * (Theta / sin(Theta)) where Q.w passes
// XMVectorInBounds against (1 - 0.00001), and V unchanged otherwise.
inline XMVECTOR XM_CALLCONV XMQuaternionLn(FXMVECTOR Q) noexcept
{
    static const XMVECTORF32 WBound = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } };

    const XMVECTOR VectorPart = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);
    const XMVECTOR W = XMVectorSplatW(Q);

    // Theta / sin(Theta), computed for every input; the mask below decides whether it is used
    const XMVECTOR Angle = XMVectorACos(W);
    const XMVECTOR Scale = XMVectorDivide(Angle, XMVectorSin(Angle));
    const XMVECTOR Scaled = XMVectorMultiply(VectorPart, Scale);

    const XMVECTOR WInBounds = XMVectorInBounds(W, WBound.v);
    return XMVectorSelect(VectorPart, Scaled, WInBounds);
}

//------------------------------------------------------------------------------

// Exponential of quaternion Q.
// With theta = length of the x/y/z part of Q, the x/y/z lanes of the result
// are Q * sin(theta) / theta (or Q unchanged where theta is within g_XMEpsilon
// of zero) and the w lane is cos(theta).
inline XMVECTOR XM_CALLCONV XMQuaternionExp(FXMVECTOR Q) noexcept
{
    const XMVECTOR Angle = XMVector3Length(Q);

    XMVECTOR SinAngle, CosAngle;
    XMVectorSinCos(&SinAngle, &CosAngle, Angle);

    const XMVECTOR Scaled = XMVectorMultiply(Q, XMVectorDivide(SinAngle, Angle));

    // Near-zero angle: keep Q as is rather than the sin(theta) / theta product.
    const XMVECTOR NearZero = XMVectorNearEqual(Angle, XMVectorZero(), g_XMEpsilon.v);
    const XMVECTOR VecPart = XMVectorSelect(Scaled, Q, NearZero);

    // x, y, z from the vector part; w from the cosine.
    return XMVectorSelect(CosAngle, VecPart, g_XMSelect1110.v);
}

//------------------------------------------------------------------------------

// Scalar-parameter front end for XMQuaternionSlerpV: replicates t into every
// lane and forwards the call.
inline XMVECTOR XM_CALLCONV XMQuaternionSlerp
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    float    t
) noexcept
{
    return XMQuaternionSlerpV(Q0, Q1, XMVectorReplicate(t));
}

//------------------------------------------------------------------------------

// Spherical linear interpolation between Q0 and Q1.
//
// T must hold the same interpolation parameter in all four lanes (asserted).
// The dot product of Q0 and Q1 is used directly as cos(Omega), so the inputs
// are presumably expected to be unit quaternions - TODO confirm with callers.
//
// If the dot product is negative, the cosine and the weight applied to Q1 are
// both negated.  When the sign-adjusted cosine is not below 1 - 0.00001, the
// sin-based weights are replaced by the linear weights held in V01.
inline XMVECTOR XM_CALLCONV XMQuaternionSlerpV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR T
) noexcept
{
    assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));

    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } };

    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);

    // Sign is -1 where the dot product is negative, +1 otherwise; multiplying
    // by it makes CosOmega non-negative.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
    XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);

    CosOmega = XMVectorMultiply(CosOmega, Sign);

    // Control now marks lanes where the sin-based weights are used.
    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    // SinOmega = sqrt(1 - CosOmega^2)
    XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
    SinOmega = XMVectorSqrt(SinOmega);

    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);

    // Build V01; this appears to yield (1 - t, t, 0, 0), assuming
    // g_XMIdentityR0 is (1, 0, 0, 0) and the shifts leave the sign bit only
    // in the x lane - TODO confirm against the helper definitions.
    XMVECTOR SignMask = XMVectorSplatSignMask();
    XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2);
    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
    V01 = XMVectorXorInt(V01, SignMask);
    V01 = XMVectorAdd(g_XMIdentityR0.v, V01);

    XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega);

    // S0 = sin(V01 * Omega) / sin(Omega), per lane
    XMVECTOR S0 = XMVectorMultiply(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = XMVectorMultiply(S0, InvSinOmega);

    // Fall back to V01 itself where Control is not set.
    S0 = XMVectorSelect(V01, S0, Control);

    // Lane x is the weight for Q0, lane y the weight for Q1.
    XMVECTOR S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    S1 = XMVectorMultiply(S1, Sign);

    XMVECTOR Result = XMVectorMultiply(Q0, S0);
    Result = XMVectorMultiplyAdd(Q1, S1, Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 OneMinusEpsilon = { { { 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f } } };
    // Sign bit in the x lane only; used to negate t in lane x of V01.
    static const XMVECTORU32 SignMask2 = { { { 0x80000000, 0x00000000, 0x00000000, 0x00000000 } } };

    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);

    // Sign is -1 where the dot product is negative, +1 otherwise; multiplying
    // by it makes CosOmega non-negative.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
    XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);

    CosOmega = _mm_mul_ps(CosOmega, Sign);

    // Control now marks lanes where the sin-based weights are used.
    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    // SinOmega = sqrt(1 - CosOmega^2)
    XMVECTOR SinOmega = _mm_mul_ps(CosOmega, CosOmega);
    SinOmega = _mm_sub_ps(g_XMOne, SinOmega);
    SinOmega = _mm_sqrt_ps(SinOmega);

    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);

    // Build V01; this appears to yield (1 - t, t, 0, 0), assuming g_XMMaskXY
    // keeps only the x and y lanes and g_XMIdentityR0 is (1, 0, 0, 0) - TODO
    // confirm against the constant definitions.
    XMVECTOR V01 = XM_PERMUTE_PS(T, _MM_SHUFFLE(2, 3, 0, 1));
    V01 = _mm_and_ps(V01, g_XMMaskXY);
    V01 = _mm_xor_ps(V01, SignMask2);
    V01 = _mm_add_ps(g_XMIdentityR0, V01);

    // S0 = sin(V01 * Omega) / sin(Omega), per lane
    XMVECTOR S0 = _mm_mul_ps(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = _mm_div_ps(S0, SinOmega);

    // Fall back to V01 itself where Control is not set.
    S0 = XMVectorSelect(V01, S0, Control);

    // Lane x is the weight for Q0, lane y the weight for Q1.
    XMVECTOR S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    S1 = _mm_mul_ps(S1, Sign);
    XMVECTOR Result = _mm_mul_ps(Q0, S0);
    S1 = _mm_mul_ps(S1, Q1);
    Result = _mm_add_ps(Result, S1);
    return Result;
#endif
}

//------------------------------------------------------------------------------

// Scalar-parameter front end for XMQuaternionSquadV: replicates t into every
// lane and forwards the call.
inline XMVECTOR XM_CALLCONV XMQuaternionSquad
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR Q3,
    float    t
) noexcept
{
    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, XMVectorReplicate(t));
}

//------------------------------------------------------------------------------

// Spherical quadrangle interpolation built from three slerps:
//   slerp(slerp(Q0, Q3, T), slerp(Q1, Q2, T), 2 * (T - T * T))
// T must hold the same parameter in all four lanes (asserted).
inline XMVECTOR XM_CALLCONV XMQuaternionSquadV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR Q3,
    HXMVECTOR T
) noexcept
{
    assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));

    // The two arcs evaluated at the caller's parameter.
    const XMVECTOR Arc03 = XMQuaternionSlerpV(Q0, Q3, T);
    const XMVECTOR Arc12 = XMQuaternionSlerpV(Q1, Q2, T);

    // Blend parameter: (T - T * T) * 2
    XMVECTOR Blend = XMVectorNegativeMultiplySubtract(T, T, T);
    Blend = XMVectorMultiply(Blend, XMVectorSplatConstant(2, 0));

    return XMQuaternionSlerpV(Arc03, Arc12, Blend);
}

//------------------------------------------------------------------------------

// Derives three quaternions from Q0..Q3 and stores them through pA, pB, pC
// (presumably the control points later fed to XMQuaternionSquad - confirm
// against callers).
//
// Q2, Q0 and Q3 are each replaced by their negation when the squared length of
// the sum with their neighbour is smaller than that of the difference.  With
// those sign-adjusted values Q0', Q2', Q3':
//   *pA = Multiply(Q1,  Exp(-1/4 * (Ln(Multiply(Q1^-1,  Q0')) + Ln(Multiply(Q1^-1,  Q2')))))
//   *pB = Multiply(Q2', Exp(-1/4 * (Ln(Multiply(Q2'^-1, Q1))  + Ln(Multiply(Q2'^-1, Q3')))))
//   *pC = Q2'
// All three output pointers must be non-null (asserted).
inline void XM_CALLCONV XMQuaternionSquadSetup
(
    XMVECTOR* pA,
    XMVECTOR* pB,
    XMVECTOR* pC,
    FXMVECTOR  Q0,
    FXMVECTOR  Q1,
    FXMVECTOR  Q2,
    GXMVECTOR  Q3
) noexcept
{
    assert(pA);
    assert(pB);
    assert(pC);

    // Q2': negate Q2 when |Q1 + Q2|^2 < |Q1 - Q2|^2.
    const XMVECTOR Sum12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
    const XMVECTOR Diff12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
    const XMVECTOR Flip2 = XMVectorLess(Sum12, Diff12);
    const XMVECTOR AdjQ2 = XMVectorSelect(Q2, XMVectorNegate(Q2), Flip2);

    // Q0': negate Q0 when |Q0 + Q1|^2 < |Q0 - Q1|^2.
    const XMVECTOR Sum01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
    const XMVECTOR Diff01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
    const XMVECTOR Flip0 = XMVectorLess(Sum01, Diff01);
    const XMVECTOR AdjQ0 = XMVectorSelect(Q0, XMVectorNegate(Q0), Flip0);

    // Q3': negate Q3 when |Q2' + Q3|^2 < |Q2' - Q3|^2.
    const XMVECTOR Sum23 = XMQuaternionLengthSq(XMVectorAdd(AdjQ2, Q3));
    const XMVECTOR Diff23 = XMQuaternionLengthSq(XMVectorSubtract(AdjQ2, Q3));
    const XMVECTOR Flip3 = XMVectorLess(Sum23, Diff23);
    const XMVECTOR AdjQ3 = XMVectorSelect(Q3, XMVectorNegate(Q3), Flip3);

    const XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
    const XMVECTOR InvAdjQ2 = XMQuaternionInverse(AdjQ2);

    // Logarithms of each neighbour relative to Q1 and to Q2'.
    const XMVECTOR Ln10 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, AdjQ0));
    const XMVECTOR Ln12 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, AdjQ2));
    const XMVECTOR Ln21 = XMQuaternionLn(XMQuaternionMultiply(InvAdjQ2, Q1));
    const XMVECTOR Ln23 = XMQuaternionLn(XMQuaternionMultiply(InvAdjQ2, AdjQ3));

    const XMVECTOR MinusQuarter = XMVectorSplatConstant(-1, 2);

    const XMVECTOR ExpA = XMQuaternionExp(XMVectorMultiply(XMVectorAdd(Ln10, Ln12), MinusQuarter));
    const XMVECTOR ExpB = XMQuaternionExp(XMVectorMultiply(XMVectorAdd(Ln21, Ln23), MinusQuarter));

    // Outputs are written only after every input has been read.
    *pA = XMQuaternionMultiply(Q1, ExpA);
    *pB = XMQuaternionMultiply(AdjQ2, ExpB);
    *pC = AdjQ2;
}

//------------------------------------------------------------------------------

// Barycentric blend of three quaternions using scalar weights f and g.
// With s = f + g: returns Q0 when s lies strictly inside (-0.00001, 0.00001),
// otherwise slerp(slerp(Q0, Q1, s), slerp(Q0, Q2, s), g / s).
inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentric
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    float    f,
    float    g
) noexcept
{
    const float s = f + g;

    // Weights sum to (almost) zero: nothing to blend, and g / s would blow up.
    if ((s < 0.00001f) && (s > -0.00001f))
    {
        return Q0;
    }

    const XMVECTOR Toward1 = XMQuaternionSlerp(Q0, Q1, s);
    const XMVECTOR Toward2 = XMQuaternionSlerp(Q0, Q2, s);

    return XMQuaternionSlerp(Toward1, Toward2, g / s);
}

//------------------------------------------------------------------------------

// Vector-parameter barycentric blend of three quaternions.
// F and G must each hold one value replicated in all four lanes (asserted).
// With S = F + G: returns Q0 when S passes XMVector4InBounds against
// XMVectorSplatConstant(1, 16), otherwise
// slerp(slerp(Q0, Q1, S), slerp(Q0, Q2, S), G * (1 / S)).
inline XMVECTOR XM_CALLCONV XMQuaternionBaryCentricV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR F,
    HXMVECTOR G
) noexcept
{
    assert((XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)));
    assert((XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)));

    const XMVECTOR Sum = XMVectorAdd(F, G);

    // Weights sum to (almost) zero: return the first quaternion untouched.
    if (XMVector4InBounds(Sum, XMVectorSplatConstant(1, 16)))
    {
        return Q0;
    }

    const XMVECTOR Toward1 = XMQuaternionSlerpV(Q0, Q1, Sum);
    const XMVECTOR Toward2 = XMQuaternionSlerpV(Q0, Q2, Sum);

    // Ratio = G * (1 / Sum)
    const XMVECTOR Ratio = XMVectorMultiply(G, XMVectorReciprocal(Sum));

    return XMQuaternionSlerpV(Toward1, Toward2, Ratio);
}

//------------------------------------------------------------------------------
// Transformation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the identity quaternion, taken from the global constant
// g_XMIdentityR3.
inline XMVECTOR XM_CALLCONV XMQuaternionIdentity() noexcept
{
    const XMVECTOR Identity = g_XMIdentityR3.v;
    return Identity;
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from three angles passed as separate scalars.
// The scalar build evaluates the half-angle formula directly; every other
// build packs the angles as <Pitch, Yaw, Roll, 0> and defers to
// XMQuaternionRotationRollPitchYawFromVector.
inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYaw
(
    float Pitch,
    float Yaw,
    float Roll
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Half angles for each axis.
    const float pitchHalf = Pitch * 0.5f;
    const float yawHalf = Yaw * 0.5f;
    const float rollHalf = Roll * 0.5f;

    const float cosP = cosf(pitchHalf);
    const float sinP = sinf(pitchHalf);
    const float cosY = cosf(yawHalf);
    const float sinY = sinf(yawHalf);
    const float cosR = cosf(rollHalf);
    const float sinR = sinf(rollHalf);

    XMVECTORF32 Quat = { { {
            cosR * sinP * cosY + sinR * cosP * sinY,
            cosR * cosP * sinY - sinR * sinP * cosY,
            sinR * cosP * cosY - cosR * sinP * sinY,
            cosR * cosP * cosY + sinR * sinP * sinY
        } } };
    return Quat;
#else
    return XMQuaternionRotationRollPitchYawFromVector(XMVectorSet(Pitch, Yaw, Roll, 0.0f));
#endif
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from three angles packed in a vector as
// <Pitch, Yaw, Roll, 0>.
//
// With sp/cp, sy/cy, sr/cr the sine/cosine of half the pitch, yaw and roll
// angles, the scalar path returns:
//   x = cr*sp*cy + sr*cp*sy
//   y = cr*cp*sy - sr*sp*cy
//   z = sr*cp*cy - cr*sp*sy
//   w = cr*cp*cy + sr*sp*sy
// The vector path assembles products of per-lane sine/cosine permutations
// that appear to give the same four expressions.
inline XMVECTOR XM_CALLCONV XMQuaternionRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float halfpitch = Angles.vector4_f32[0] * 0.5f;
    float cp = cosf(halfpitch);
    float sp = sinf(halfpitch);

    const float halfyaw = Angles.vector4_f32[1] * 0.5f;
    float cy = cosf(halfyaw);
    float sy = sinf(halfyaw);

    const float halfroll = Angles.vector4_f32[2] * 0.5f;
    float cr = cosf(halfroll);
    float sr = sinf(halfroll);

    XMVECTORF32 vResult = { { {
            cr * sp * cy + sr * cp * sy,
            cr * cp * sy - sr * sp * cy,
            sr * cp * cy - cr * sp * sy,
            cr * cp * cy + sr * sp * sy
        } } };
    return vResult;
#else
    // Signs applied to the second product term of each lane.
    static const XMVECTORF32  Sign = { { { 1.0f, -1.0f, -1.0f, 1.0f } } };

    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);

    XMVECTOR SinAngles, CosAngles;
    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);

    // Lane layouts below assume XM_PERMUTE_0* selects from the first argument
    // and XM_PERMUTE_1* from the second - TODO confirm against XMVectorPermute.
    // P0 = (sp, cp, cp, cp), Y0 = (cy, sy, cy, cy), R0 = (cr, cr, sr, cr)
    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
    // Same patterns with sine and cosine swapped:
    // P1 = (cp, sp, sp, sp), Y1 = (sy, cy, sy, sy), R1 = (sr, sr, cr, sr)
    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);

    // Q = P0 * Y0 * R0 + (P1 * Sign) * Y1 * R1
    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
    Q1 = XMVectorMultiply(Q1, Y1);
    Q0 = XMVectorMultiply(Q0, R0);
    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);

    return Q;
#endif
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from an axis and an angle:
//   (NormalAxis.xyz * sin(Angle / 2), cos(Angle / 2))
// The axis is used as given (no normalization happens here), so it is
// presumably expected to be unit length already.
inline XMVECTOR XM_CALLCONV XMQuaternionRotationNormal
(
    FXMVECTOR NormalAxis,
    float    Angle
) noexcept
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    float HalfSin, HalfCos;
    XMScalarSinCos(&HalfSin, &HalfCos, 0.5f * Angle);

    // Axis in x, y, z with the w lane taken from g_XMOne.
    const XMVECTOR AxisW1 = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);

    // (sin, sin, sin, cos) of the half angle.
    const XMVECTOR Factors = XMVectorSet(HalfSin, HalfSin, HalfSin, HalfCos);
    return XMVectorMultiply(AxisW1, Factors);
#elif defined(_XM_SSE_INTRINSICS_)
    // Sine and cosine of the half angle in every lane.
    XMVECTOR vSine;
    XMVECTOR vCosine;
    XMVectorSinCos(&vSine, &vCosine, _mm_set_ps1(0.5f * Angle));

    // Merge the x/y/z lanes of the sine with the w lane of the cosine.
    const XMVECTOR SinXYZ = _mm_and_ps(vSine, g_XMMask3);
    const XMVECTOR CosW = _mm_and_ps(vCosine, g_XMMaskW);
    const XMVECTOR Factors = _mm_or_ps(SinXYZ, CosW);

    // Axis with its w lane masked off, then OR-ed with g_XMIdentityR3.
    XMVECTOR AxisW1 = _mm_and_ps(NormalAxis, g_XMMask3);
    AxisW1 = _mm_or_ps(AxisW1, g_XMIdentityR3);

    return _mm_mul_ps(AxisW1, Factors);
#endif
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from an arbitrary-length axis and an angle by
// normalizing the axis and forwarding to XMQuaternionRotationNormal.
// The axis must be neither zero nor infinite (asserted).
inline XMVECTOR XM_CALLCONV XMQuaternionRotationAxis
(
    FXMVECTOR Axis,
    float    Angle
) noexcept
{
    assert(!XMVector3Equal(Axis, XMVectorZero()));
    assert(!XMVector3IsInfinite(Axis));

    const XMVECTOR UnitAxis = XMVector3Normalize(Axis);
    return XMQuaternionRotationNormal(UnitAxis, Angle);
}

//------------------------------------------------------------------------------

// Extracts a quaternion from the upper-left 3x3 part of M (only rows 0..2 and
// columns 0..2 are read).
//
// All three code paths follow the same idea: use the signs of r22,
// r11 - r00 and r11 + r00 to decide which of x, y, z, w has the largest
// magnitude, then build the result from the matrix entries relative to that
// component.  The scalar path scales by 0.5 / sqrt(4 * c^2); the SIMD paths
// build all four candidate rows (4*c*x, 4*c*y, 4*c*z, 4*c*w), pick one with
// bit masks and normalize it.
//
// The normalization comments below rely on the quaternion being unit length,
// so M is presumably expected to be a pure rotation - confirm with callers.
inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 q;
    float r22 = M.m[2][2];
    if (r22 <= 0.f)  // x^2 + y^2 >= z^2 + w^2
    {
        float dif10 = M.m[1][1] - M.m[0][0];
        float omr22 = 1.f - r22;
        if (dif10 <= 0.f)  // x^2 >= y^2
        {
            // x is the dominant component: fourXSqr = 1 + r00 - r11 - r22
            float fourXSqr = omr22 - dif10;
            float inv4x = 0.5f / sqrtf(fourXSqr);
            q.f[0] = fourXSqr * inv4x;
            q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x;
            q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x;
            q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x;
        }
        else  // y^2 >= x^2
        {
            // y is the dominant component: fourYSqr = 1 - r00 + r11 - r22
            float fourYSqr = omr22 + dif10;
            float inv4y = 0.5f / sqrtf(fourYSqr);
            q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y;
            q.f[1] = fourYSqr * inv4y;
            q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y;
            q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y;
        }
    }
    else  // z^2 + w^2 >= x^2 + y^2
    {
        float sum10 = M.m[1][1] + M.m[0][0];
        float opr22 = 1.f + r22;
        if (sum10 <= 0.f)  // z^2 >= w^2
        {
            // z is the dominant component: fourZSqr = 1 - r00 - r11 + r22
            float fourZSqr = opr22 - sum10;
            float inv4z = 0.5f / sqrtf(fourZSqr);
            q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z;
            q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z;
            q.f[2] = fourZSqr * inv4z;
            q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z;
        }
        else  // w^2 >= z^2
        {
            // w is the dominant component: fourWSqr = 1 + r00 + r11 + r22
            float fourWSqr = opr22 + sum10;
            float inv4w = 0.5f / sqrtf(fourWSqr);
            q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w;
            q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w;
            q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w;
            q.f[3] = fourWSqr * inv4w;
        }
    }
    return q.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Per-lane sign patterns (P = +1, M = -1) used to combine r00, r11, r22.
    static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } };
    static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } };
    static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } };
    static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
    static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };

    float32x4_t r0 = M.r[0];
    float32x4_t r1 = M.r[1];
    float32x4_t r2 = M.r[2];

    // Diagonal entries splatted across all lanes.
    float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
    float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
    float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
    float32x4_t r11mr00 = vsubq_f32(r11, r00);
    uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
    float32x4_t r11pr00 = vaddq_f32(r11, r00);
    uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
    uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
    float32x4_t t0 = vmulq_f32(XMPMMP, r00);
    float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
    x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
    x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);

    // (r01, r02, r12, r11)
    t0 = vextq_f32(r0, r0, 1);
    float32x4_t t1 = vextq_f32(r1, r1, 1);
    t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));

    // (r10, r20, r21, r10)
    t1 = vextq_f32(r2, r2, 3);
    float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
    t1 = vbslq_f32(Select0110, t1, r10);

    // (4*x*y, 4*x*z, 4*y*z, unused)
    float32x4_t xyxzyz = vaddq_f32(t0, t1);

    // (r21, r20, r10, r10)
    t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));

    // (r12, r02, r01, r12)
    float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
    float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
    t1 = vbslq_f32(Select0110, t2, t3);

    // (4*x*w, 4*y*w, 4*z*w, unused)
    float32x4_t xwywzw = vsubq_f32(t0, t1);
    xwywzw = vmulq_f32(XMMPMP, xwywzw);

    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    t0 = vextq_f32(xyxzyz, xyxzyz, 3);
    t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
    t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
    float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);

    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
    t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
    float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);

    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    t0 = vextq_f32(xyxzyz, xyxzyz, 1);
    t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
    float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);

    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
    float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);

    // Select the row of the tensor-product matrix that has the largest
    // magnitude.
    t0 = vbslq_f32(x2gey2, tensor0, tensor1);
    t1 = vbslq_f32(z2gew2, tensor2, tensor3);
    t2 = vbslq_f32(x2py2gez2pw2, t0, t1);

    // Normalize the row.  No division by zero is possible because the
    // quaternion is unit-length (and the row is a nonzero multiple of
    // the quaternion).
    t0 = XMVector4Length(t2);
    return XMVectorDivide(t2, t0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-lane sign patterns (P = +1, M = -1) used to combine r00, r11, r22.
    static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } };
    static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } };
    static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } };

    XMVECTOR r0 = M.r[0];  // (r00, r01, r02, 0)
    XMVECTOR r1 = M.r[1];  // (r10, r11, r12, 0)
    XMVECTOR r2 = M.r[2];  // (r20, r21, r22, 0)

    // (r00, r00, r00, r00)
    XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0));
    // (r11, r11, r11, r11)
    XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1));
    // (r22, r22, r22, r22)
    XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2));

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
    // (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
    XMVECTOR r11mr00 = _mm_sub_ps(r11, r00);
    XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
    // (r11 + r00, r11 + r00, r11 + r00, r11 + r00)
    XMVECTOR r11pr00 = _mm_add_ps(r11, r00);
    XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
    XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
    XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne);
    XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11);
    XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0);
    XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2);

    // (r01, r02, r12, r11)
    t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1));
    // (r10, r10, r20, r21)
    t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0));
    // (r10, r20, r21, r10)
    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
    // (4*x*y, 4*x*z, 4*y*z, unused)
    XMVECTOR xyxzyz = _mm_add_ps(t0, t1);

    // (r21, r20, r10, r10)
    t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1));
    // (r12, r12, r02, r01)
    t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2));
    // (r12, r02, r01, r12)
    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
    // (4*x*w, 4*y*w, 4*z*w, unused)
    XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
    xwywzw = _mm_mul_ps(XMMPMP, xwywzw);

    // (4*x^2, 4*y^2, 4*x*y, unused)
    t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0));
    // (4*z^2, 4*w^2, 4*z*w, unused)
    t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2));
    // (4*x*z, 4*y*z, 4*x*w, 4*y*w)
    t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1));

    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0));
    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2));
    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0));
    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
    XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2));

    // Select the row of the tensor-product matrix that has the largest
    // magnitude.  Each and/andnot/or triple is a bitwise select on the
    // corresponding comparison mask.
    t0 = _mm_and_ps(x2gey2, tensor0);
    t1 = _mm_andnot_ps(x2gey2, tensor1);
    t0 = _mm_or_ps(t0, t1);
    t1 = _mm_and_ps(z2gew2, tensor2);
    t2 = _mm_andnot_ps(z2gew2, tensor3);
    t1 = _mm_or_ps(t1, t2);
    t0 = _mm_and_ps(x2py2gez2pw2, t0);
    t1 = _mm_andnot_ps(x2py2gez2pw2, t1);
    t2 = _mm_or_ps(t0, t1);

    // Normalize the row.  No division by zero is possible because the
    // quaternion is unit-length (and the row is a nonzero multiple of
    // the quaternion).
    t0 = XMVector4Length(t2);
    return _mm_div_ps(t2, t0);
#endif
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Splits quaternion Q into an axis and an angle.
// *pAxis receives Q unchanged (all four lanes, not normalized) and *pAngle
// receives 2 * acos(Q.w).  Both pointers must be non-null (asserted).
inline void XM_CALLCONV XMQuaternionToAxisAngle
(
    XMVECTOR* pAxis,
    float* pAngle,
    FXMVECTOR  Q
) noexcept
{
    assert(pAxis);
    assert(pAngle);

    // The axis is simply the quaternion itself.
    *pAxis = Q;

    const float HalfAngle = XMScalarACos(XMVectorGetW(Q));
    *pAngle = 2.0f * HalfAngle;
}

/****************************************************************************
 *
 * Plane
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Comparison operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------

// Tests two planes for equality by comparing all four components with
// XMVector4Equal.
inline bool XM_CALLCONV XMPlaneEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
) noexcept
{
    const bool Same = XMVector4Equal(P1, P2);
    return Same;
}

//------------------------------------------------------------------------------

// Tests two planes for approximate equality: both are normalized with
// XMPlaneNormalize first, then compared with XMVector4NearEqual using Epsilon.
inline bool XM_CALLCONV XMPlaneNearEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2,
    FXMVECTOR Epsilon
) noexcept
{
    const XMVECTOR Unit1 = XMPlaneNormalize(P1);
    const XMVECTOR Unit2 = XMPlaneNormalize(P2);

    const bool Close = XMVector4NearEqual(Unit1, Unit2, Epsilon);
    return Close;
}

//------------------------------------------------------------------------------

// Tests two planes for inequality by forwarding to XMVector4NotEqual.
inline bool XM_CALLCONV XMPlaneNotEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
) noexcept
{
    const bool Differ = XMVector4NotEqual(P1, P2);
    return Differ;
}

//------------------------------------------------------------------------------

// Reports the result of XMVector4IsNaN applied to the plane's four components.
inline bool XM_CALLCONV XMPlaneIsNaN(FXMVECTOR P) noexcept
{
    const bool HasNaN = XMVector4IsNaN(P);
    return HasNaN;
}

//------------------------------------------------------------------------------

// Reports the result of XMVector4IsInfinite applied to the plane's four
// components.
inline bool XM_CALLCONV XMPlaneIsInfinite(FXMVECTOR P) noexcept
{
    const bool HasInfinity = XMVector4IsInfinite(P);
    return HasInfinity;
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Four-component dot product of plane P with vector V.
inline XMVECTOR XM_CALLCONV XMPlaneDot
(
    FXMVECTOR P,
    FXMVECTOR V
) noexcept
{
    const XMVECTOR Dot = XMVector4Dot(P, V);
    return Dot;
}

//------------------------------------------------------------------------------

// Dot product of plane P with V treated as a point (w forced to 1):
//   P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
inline XMVECTOR XM_CALLCONV XMPlaneDotCoord
(
    FXMVECTOR P,
    FXMVECTOR V
) noexcept
{
    // Keep x, y, z from V and take the w lane from g_XMOne.
    const XMVECTOR PointW1 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
    return XMVector4Dot(P, PointW1);
}

//------------------------------------------------------------------------------

// Three-component dot product of the plane's x, y, z with V; the w lanes do
// not take part.
inline XMVECTOR XM_CALLCONV XMPlaneDotNormal
(
    FXMVECTOR P,
    FXMVECTOR V
) noexcept
{
    const XMVECTOR Dot = XMVector3Dot(P, V);
    return Dot;
}

//------------------------------------------------------------------------------
// XMPlaneNormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.

// Scales all four components of plane P by an estimate of
// 1 / sqrt(x^2 + y^2 + z^2), i.e. by the estimated reciprocal length of the
// plane's x, y, z part.
inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst(FXMVECTOR P) noexcept
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
    return XMVectorMultiply(P, Result);

#elif defined(_XM_SSE4_INTRINSICS_)
    // Dot product of x, y, z only (mask 0x7f), broadcast to every lane
    XMVECTOR vTemp = _mm_dp_ps(P, P, 0x7f);
    // Estimated reciprocal square root of the squared length
    XMVECTOR vResult = _mm_rsqrt_ps(vTemp);
    // Scale the plane by it
    return _mm_mul_ps(vResult, P);
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product
    XMVECTOR vDot = _mm_mul_ps(P, P);
    // x=Dot.y, y=Dot.z
    XMVECTOR vTemp = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(2, 1, 2, 1));
    // Result.x = x+y
    vDot = _mm_add_ss(vDot, vTemp);
    // x=Dot.z
    vTemp = XM_PERMUTE_PS(vTemp, _MM_SHUFFLE(1, 1, 1, 1));
    // Result.x = (x+y)+z
    vDot = _mm_add_ss(vDot, vTemp);
    // Splat x
    vDot = XM_PERMUTE_PS(vDot, _MM_SHUFFLE(0, 0, 0, 0));
    // Get the estimated reciprocal square root of the squared length
    vDot = _mm_rsqrt_ps(vDot);
    // Scale the plane by the reciprocal length
    vDot = _mm_mul_ps(vDot, P);
    return vDot;
#endif
}

//------------------------------------------------------------------------------

// Divides all four components of plane P by the length of its x, y, z part.
//
// Special cases differ per code path:
//  - scalar path: a length that is not greater than zero leaves the scale at
//    that length value, so a zero-length plane yields zeros;
//  - SSE paths: lanes are zeroed when the squared length equals g_XMInfinity;
//    no separate zero-length check is made.
inline XMVECTOR XM_CALLCONV XMPlaneNormalize(FXMVECTOR P) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    // Length of the x, y, z part; it is then turned into its reciprocal.
    float fScale = sqrtf((P.vector4_f32[0] * P.vector4_f32[0]) + (P.vector4_f32[1] * P.vector4_f32[1]) + (P.vector4_f32[2] * P.vector4_f32[2]));
    // Skip the reciprocal for a zero length to avoid dividing by zero
    if (fScale > 0)
    {
        fScale = 1.0f / fScale;
    }
    XMVECTORF32 vNormalized = { { {
            P.vector4_f32[0] * fScale,
            P.vector4_f32[1] * fScale,
            P.vector4_f32[2] * fScale,
            P.vector4_f32[3] * fScale
        } } };
    return vNormalized.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR vInvLength = XMVector3ReciprocalLength(P);
    return XMVectorMultiply(P, vInvLength);
#elif defined(_XM_SSE4_INTRINSICS_)
    // Squared length of x, y, z (mask 0x7f), broadcast to every lane
    const XMVECTOR vLenSq = _mm_dp_ps(P, P, 0x7f);
    const XMVECTOR vLen = _mm_sqrt_ps(vLenSq);
    // All-ones where the squared length is not infinity
    const XMVECTOR vNotInf = _mm_cmpneq_ps(vLenSq, g_XMInfinity);
    // Divide by the length, then zero any lane flagged as infinite
    const XMVECTOR vScaled = _mm_div_ps(P, vLen);
    return _mm_and_ps(vScaled, vNotInf);
#elif defined(_XM_SSE_INTRINSICS_)
    // Sum x^2 + y^2 + z^2 into lane x, then splat it
    XMVECTOR vSum = _mm_mul_ps(P, P);
    XMVECTOR vShuffled = XM_PERMUTE_PS(vSum, _MM_SHUFFLE(2, 1, 2, 1));
    vSum = _mm_add_ss(vSum, vShuffled);
    vShuffled = XM_PERMUTE_PS(vShuffled, _MM_SHUFFLE(1, 1, 1, 1));
    vSum = _mm_add_ss(vSum, vShuffled);
    const XMVECTOR vLenSq = XM_PERMUTE_PS(vSum, _MM_SHUFFLE(0, 0, 0, 0));
    const XMVECTOR vLen = _mm_sqrt_ps(vLenSq);
    // All-ones where the squared length is not infinity
    const XMVECTOR vNotInf = _mm_cmpneq_ps(vLenSq, g_XMInfinity);
    // Divide by the length, then zero any lane flagged as infinite
    const XMVECTOR vScaled = _mm_div_ps(P, vLen);
    return _mm_and_ps(vScaled, vNotInf);
#endif
}

//------------------------------------------------------------------------------

// Intersects plane P with the line through LinePoint1 and LinePoint2.
// With D = dot3(P, LinePoint1) - dot3(P, LinePoint2), the result is
//   LinePoint1 + (LinePoint2 - LinePoint1) * (PlaneDotCoord(P, LinePoint1) / D)
// and g_XMQNaN in the lanes where D is within g_XMEpsilon of zero.
inline XMVECTOR XM_CALLCONV XMPlaneIntersectLine
(
    FXMVECTOR P,
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2
) noexcept
{
    // Difference of the two endpoints' projections onto the plane normal.
    const XMVECTOR Proj1 = XMVector3Dot(P, LinePoint1);
    const XMVECTOR Proj2 = XMVector3Dot(P, LinePoint2);
    const XMVECTOR Denom = XMVectorSubtract(Proj1, Proj2);

    // Parametric position of the hit along the segment direction.
    const XMVECTOR Param = XMVectorDivide(XMPlaneDotCoord(P, LinePoint1), Denom);

    const XMVECTOR Direction = XMVectorSubtract(LinePoint2, LinePoint1);
    const XMVECTOR Hit = XMVectorMultiplyAdd(Direction, Param, LinePoint1);

    // A near-zero denominator means there is no usable intersection.
    const XMVECTOR NoHit = XMVectorNearEqual(Denom, XMVectorZero(), g_XMEpsilon.v);

    return XMVectorSelect(Hit, g_XMQNaN.v, NoHit);
}

//------------------------------------------------------------------------------

// Intersects planes P1 and P2 and writes two points of the shared line.
// With Dir = cross(P2, P1):
//   *pLinePoint1 = (cross(P2, Dir) * P1.w + cross(Dir, P1) * P2.w) / |Dir|^2
//   *pLinePoint2 = *pLinePoint1 + Dir
// Both outputs are g_XMQNaN in the lanes where |Dir|^2 <= g_XMEpsilon.
// Both pointers must be non-null (asserted).
inline void XM_CALLCONV XMPlaneIntersectPlane
(
    XMVECTOR* pLinePoint1,
    XMVECTOR* pLinePoint2,
    FXMVECTOR  P1,
    FXMVECTOR  P2
) noexcept
{
    assert(pLinePoint1);
    assert(pLinePoint2);

    // Direction of the intersection line and its squared length.
    const XMVECTOR Dir = XMVector3Cross(P2, P1);
    const XMVECTOR DirLenSq = XMVector3LengthSq(Dir);

    // Numerator = cross(P2, Dir) * P1.w + cross(Dir, P1) * P2.w
    const XMVECTOR TermA = XMVectorMultiply(XMVector3Cross(P2, Dir), XMVectorSplatW(P1));
    const XMVECTOR Numerator = XMVectorMultiplyAdd(XMVector3Cross(Dir, P1), XMVectorSplatW(P2), TermA);

    const XMVECTOR First = XMVectorDivide(Numerator, DirLenSq);
    const XMVECTOR Second = XMVectorAdd(First, Dir);

    // A (near) zero direction means the line is not well defined.
    const XMVECTOR Degenerate = XMVectorLessOrEqual(DirLenSq, g_XMEpsilon.v);
    *pLinePoint1 = XMVectorSelect(First, g_XMQNaN.v, Degenerate);
    *pLinePoint2 = XMVectorSelect(Second, g_XMQNaN.v, Degenerate);
}

//------------------------------------------------------------------------------

// Transforms plane P by matrix ITM, computed as
//   P.x * ITM.r[0] + P.y * ITM.r[1] + P.z * ITM.r[2] + P.w * ITM.r[3]
// accumulated starting from the w term.  The parameter name suggests the
// caller supplies the inverse transpose of the desired transform - confirm.
inline XMVECTOR XM_CALLCONV XMPlaneTransform
(
    FXMVECTOR P,
    FXMMATRIX ITM
) noexcept
{
    // Each plane component replicated across all lanes.
    const XMVECTOR Px = XMVectorSplatX(P);
    const XMVECTOR Py = XMVectorSplatY(P);
    const XMVECTOR Pz = XMVectorSplatZ(P);
    const XMVECTOR Pw = XMVectorSplatW(P);

    // Accumulate row contributions from row 3 down to row 0.
    XMVECTOR Sum = XMVectorMultiply(Pw, ITM.r[3]);
    Sum = XMVectorMultiplyAdd(Pz, ITM.r[2], Sum);
    Sum = XMVectorMultiplyAdd(Py, ITM.r[1], Sum);
    Sum = XMVectorMultiplyAdd(Px, ITM.r[0], Sum);
    return Sum;
}

//------------------------------------------------------------------------------

// Transforms a stream of planes by ITM.  This is a direct forward to
// XMVector4TransformStream with the same arguments, whose return value is
// passed back to the caller.
inline XMFLOAT4* XM_CALLCONV XMPlaneTransformStream
(
    XMFLOAT4*       pOutputStream,
    size_t          OutputStride,
    const XMFLOAT4* pInputStream,
    size_t          InputStride,
    size_t          PlaneCount,
    FXMMATRIX       ITM
) noexcept
{
    XMFLOAT4* const pResult = XMVector4TransformStream(pOutputStream, OutputStride, pInputStream, InputStride, PlaneCount, ITM);
    return pResult;
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Builds a plane from a point on it and a normal:
//   (Normal.x, Normal.y, Normal.z, -dot3(Point, Normal))
// The normal is used as given; it is not normalized here.
inline XMVECTOR XM_CALLCONV XMPlaneFromPointNormal
(
    FXMVECTOR Point,
    FXMVECTOR Normal
) noexcept
{
    const XMVECTOR Distance = XMVectorNegate(XMVector3Dot(Point, Normal));

    // x, y, z from the normal; w from the negated dot product.
    return XMVectorSelect(Distance, Normal, g_XMSelect1110.v);
}

//------------------------------------------------------------------------------

// Builds plane coefficients from three points.
// The normal is normalize(cross(Point1 - Point2, Point1 - Point3)) and the w
// coefficient is the negated XMPlaneDotNormal of that normal with Point1.
inline XMVECTOR XM_CALLCONV XMPlaneFromPoints
(
    FXMVECTOR Point1,
    FXMVECTOR Point2,
    FXMVECTOR Point3
) noexcept
{
    // Two edge vectors, both measured from Point1.
    const XMVECTOR Edge21 = XMVectorSubtract(Point1, Point2);
    const XMVECTOR Edge31 = XMVectorSubtract(Point1, Point3);

    const XMVECTOR UnitNormal = XMVector3Normalize(XMVector3Cross(Edge21, Edge31));
    const XMVECTOR NegDistance = XMVectorNegate(XMPlaneDotNormal(UnitNormal, Point1));

    // xyz from the normal, w from the negated distance term.
    return XMVectorSelect(NegDistance, UnitNormal, g_XMSelect1110.v);
}

/****************************************************************************
 *
 * Color
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------
 // Comparison operations
 //------------------------------------------------------------------------------

 //------------------------------------------------------------------------------

// Color equality test; forwards both colors to the 4D vector comparison
// XMVector4Equal and returns its result.
inline bool XM_CALLCONV XMColorEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bMatch = XMVector4Equal(C1, C2);
    return bMatch;
}

//------------------------------------------------------------------------------

// Color inequality test; forwards both colors to XMVector4NotEqual and
// returns its result.
inline bool XM_CALLCONV XMColorNotEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bDiffers = XMVector4NotEqual(C1, C2);
    return bDiffers;
}

//------------------------------------------------------------------------------

// Color "greater than" test; forwards both colors to XMVector4Greater and
// returns its result.
inline bool XM_CALLCONV XMColorGreater
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bGreater = XMVector4Greater(C1, C2);
    return bGreater;
}

//------------------------------------------------------------------------------

// Color "greater than or equal" test; forwards both colors to
// XMVector4GreaterOrEqual and returns its result.
inline bool XM_CALLCONV XMColorGreaterOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bGreaterOrEqual = XMVector4GreaterOrEqual(C1, C2);
    return bGreaterOrEqual;
}

//------------------------------------------------------------------------------

// Color "less than" test; forwards both colors to XMVector4Less and returns
// its result.
inline bool XM_CALLCONV XMColorLess
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bLess = XMVector4Less(C1, C2);
    return bLess;
}

//------------------------------------------------------------------------------

// Color "less than or equal" test; forwards both colors to
// XMVector4LessOrEqual and returns its result.
inline bool XM_CALLCONV XMColorLessOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const bool bLessOrEqual = XMVector4LessOrEqual(C1, C2);
    return bLessOrEqual;
}

//------------------------------------------------------------------------------

// NaN test for a color; forwards C to XMVector4IsNaN and returns its result.
inline bool XM_CALLCONV XMColorIsNaN(FXMVECTOR C) noexcept
{
    const bool bHasNaN = XMVector4IsNaN(C);
    return bHasNaN;
}

//------------------------------------------------------------------------------

// Infinity test for a color; forwards C to XMVector4IsInfinite and returns
// its result.
inline bool XM_CALLCONV XMColorIsInfinite(FXMVECTOR C) noexcept
{
    const bool bHasInfinity = XMVector4IsInfinite(C);
    return bHasInfinity;
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Computes the negative of a color: each of x, y, z becomes 1 - channel,
// while w is returned unchanged.
inline XMVECTOR XM_CALLCONV XMColorNegative(FXMVECTOR vColor) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    const float fInvX = 1.0f - vColor.vector4_f32[0];
    const float fInvY = 1.0f - vColor.vector4_f32[1];
    const float fInvZ = 1.0f - vColor.vector4_f32[2];
    XMVECTORF32 vInverted = { { { fInvX, fInvY, fInvZ, vColor.vector4_f32[3] } } };
    return vInverted.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Flip the sign bits of x, y, z, then add (1, 1, 1, 0).
    const uint32x4_t vFlipped = veorq_u32(vreinterpretq_u32_f32(vColor), g_XMNegate3);
    return vaddq_f32(vreinterpretq_f32_u32(vFlipped), g_XMOne3);
#elif defined(_XM_SSE_INTRINSICS_)
    // (-x, -y, -z, w) + (1, 1, 1, 0)
    return _mm_add_ps(_mm_xor_ps(vColor, g_XMNegate3), g_XMOne3);
#endif
}

//------------------------------------------------------------------------------

// Modulates one color by another: returns XMVectorMultiply(C1, C2), so all
// four channels, alpha included, are multiplied.
inline XMVECTOR XM_CALLCONV XMColorModulate
(
    FXMVECTOR C1,
    FXMVECTOR C2
) noexcept
{
    const XMVECTOR vProduct = XMVectorMultiply(C1, C2);
    return vProduct;
}

//------------------------------------------------------------------------------

// Adjusts the saturation of a color.
//   vColor      : x, y, z are the color channels, w is carried through unchanged.
//   fSaturation : scale applied to each channel's distance from the luminance.
//                 By the formula below, 0 collapses x, y, z to the luminance and
//                 1 reproduces the input (up to rounding).
// Returns the adjusted color with the original w.
inline XMVECTOR XM_CALLCONV XMColorAdjustSaturation
(
    FXMVECTOR vColor,
    float    fSaturation
) noexcept
{
    // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
    // Result = (C - Luminance) * Saturation + Luminance;

    // Per-channel luminance weights; w is zero so alpha never contributes.
    const XMVECTORF32 gvLuminance = { { { 0.2125f, 0.7154f, 0.0721f, 0.0f } } };
#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: weighted sum of the three color channels.
    float fLuminance = (vColor.vector4_f32[0] * gvLuminance.f[0]) + (vColor.vector4_f32[1] * gvLuminance.f[1]) + (vColor.vector4_f32[2] * gvLuminance.f[2]);
    XMVECTOR vResult;
    vResult.vector4_f32[0] = ((vColor.vector4_f32[0] - fLuminance) * fSaturation) + fLuminance;
    vResult.vector4_f32[1] = ((vColor.vector4_f32[1] - fLuminance) * fSaturation) + fLuminance;
    vResult.vector4_f32[2] = ((vColor.vector4_f32[2] - fLuminance) * fSaturation) + fLuminance;
    // Alpha is copied straight from the input.
    vResult.vector4_f32[3] = vColor.vector4_f32[3];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance);
    XMVECTOR vResult = vsubq_f32(vColor, vLuminance);
    // vResult = vLuminance + vResult * fSaturation
    vResult = vmlaq_n_f32(vLuminance, vResult, fSaturation);
    // Take x, y, z from the adjusted value and w from the source color.
    return vbslq_f32(g_XMSelect1110, vResult, vColor);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vLuminance = XMVector3Dot(vColor, gvLuminance);
    // Splat fSaturation
    XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
    // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
    XMVECTOR vResult = _mm_sub_ps(vColor, vLuminance);
    vResult = XM_FMADD_PS(vResult, vSaturation, vLuminance);
    // Retain w from the source color
    // (two shuffles: the first parks vResult.z next to vColor.w, the second
    // rebuilds x, y, z, w; vLuminance is reused here purely as a scratch register)
    vLuminance = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult, vLuminance, _MM_SHUFFLE(3, 0, 1, 0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Adjusts the contrast of a color around the mid-point 0.5.
//   vColor    : x, y, z are the color channels, w is carried through unchanged.
//   fContrast : scale applied to each channel's distance from 0.5. By the
//               formula below, 0 yields 0.5 in x, y, z and 1 reproduces the
//               input (up to rounding).
// Returns the adjusted color with the original w.
inline XMVECTOR XM_CALLCONV XMColorAdjustContrast
(
    FXMVECTOR vColor,
    float    fContrast
) noexcept
{
    // Result = (vColor - 0.5f) * fContrast + 0.5f;

#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = { { {
            ((vColor.vector4_f32[0] - 0.5f) * fContrast) + 0.5f,
            ((vColor.vector4_f32[1] - 0.5f) * fContrast) + 0.5f,
            ((vColor.vector4_f32[2] - 0.5f) * fContrast) + 0.5f,
            vColor.vector4_f32[3]        // Leave W untouched
        } } };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v);
    // vResult = 0.5 + vResult * fContrast
    vResult = vmlaq_n_f32(g_XMOneHalf.v, vResult, fContrast);
    // Take x, y, z from the adjusted value and w from the source color.
    return vbslq_f32(g_XMSelect1110, vResult, vColor);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vScale = _mm_set_ps1(fContrast);           // Splat the scale
    XMVECTOR vResult = _mm_sub_ps(vColor, g_XMOneHalf);  // Subtract 0.5f from the source (Saving source)
    vResult = XM_FMADD_PS(vResult, vScale, g_XMOneHalf);
    // Retain w from the source color
    // (vScale is reused below purely as a scratch register for the shuffle)
    vScale = _mm_shuffle_ps(vResult, vColor, _MM_SHUFFLE(3, 2, 2, 2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult, vScale, _MM_SHUFFLE(3, 0, 1, 0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Converts an RGB(A) color to HSL(A).
//   rgb : x = red, y = green, z = blue, w = alpha (copied to the result).
// Returns (hue, saturation, lightness, alpha). Hue is produced as a fraction of
// a full turn (sector value divided by six, wrapped by +1 when negative).
// When max - min is below g_XMEpsilon the color is treated as achromatic and
// hue and saturation are both returned as 0.
inline XMVECTOR XM_CALLCONV XMColorRGBToHSL(FXMVECTOR rgb) noexcept
{
    const XMVECTOR red = XMVectorSplatX(rgb);
    const XMVECTOR green = XMVectorSplatY(rgb);
    const XMVECTOR blue = XMVectorSplatZ(rgb);

    const XMVECTOR lo = XMVectorMin(red, XMVectorMin(green, blue));
    const XMVECTOR hi = XMVectorMax(red, XMVectorMax(green, blue));

    // min + max feeds both the lightness and the saturation denominator.
    const XMVECTOR sum = XMVectorAdd(lo, hi);
    const XMVECTOR lightness = XMVectorMultiply(sum, g_XMOneHalf);
    const XMVECTOR chroma = XMVectorSubtract(hi, lo);

    // (l, l, l, a)
    const XMVECTOR lla = XMVectorSelect(rgb, lightness, g_XMSelect1110);

    if (XMVector3Less(chroma, g_XMEpsilon))
    {
        // Achromatic: (0, 0, l, a)
        return XMVectorSelect(lla, g_XMZero, g_XMSelect1100);
    }

    // chroma / (2 - max - min) for light colors, chroma / (max + min) otherwise
    const XMVECTOR saturation = XMVector3Greater(lightness, g_XMOneHalf)
        ? XMVectorDivide(chroma, XMVectorSubtract(g_XMTwo, sum))
        : XMVectorDivide(chroma, sum);

    XMVECTOR hue;
    if (XMVector3Equal(red, hi))
    {
        // Red dominates
        hue = XMVectorDivide(XMVectorSubtract(green, blue), chroma);
    }
    else if (XMVector3Equal(green, hi))
    {
        // Green dominates
        hue = XMVectorAdd(XMVectorDivide(XMVectorSubtract(blue, red), chroma), g_XMTwo);
    }
    else
    {
        // Blue dominates
        hue = XMVectorAdd(XMVectorDivide(XMVectorSubtract(red, green), chroma), g_XMFour);
    }

    hue = XMVectorDivide(hue, g_XMSix);

    // Wrap a negative hue back into the positive range.
    if (XMVector3Less(hue, g_XMZero))
    {
        hue = XMVectorAdd(hue, g_XMOne);
    }

    // (h, h, l, a) then splice saturation into y.
    const XMVECTOR hhla = XMVectorSelect(lla, hue, g_XMSelect1100);
    return XMVectorSelect(saturation, hhla, g_XMSelect1011);
}

//------------------------------------------------------------------------------

namespace Internal
{

    // Helper for the HSL -> RGB conversion: evaluates one color channel from the
    // two intermediate values p and q and a hue offset h.
    // h is first wrapped by at most one turn in each direction, then mapped
    // through four segments: a rising ramp below 1/6, q below 1/2, a falling
    // ramp below 2/3, and p otherwise.
    inline XMVECTOR XM_CALLCONV XMColorHue2Clr(FXMVECTOR p, FXMVECTOR q, FXMVECTOR h) noexcept
    {
        static const XMVECTORF32 kOneSixth = { { { 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f, 1.0f / 6.0f } } };
        static const XMVECTORF32 kTwoThirds = { { { 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f } } };

        XMVECTOR hue = h;

        // The second test looks at the already adjusted value, as the two
        // wraps are applied one after the other.
        if (XMVector3Less(hue, g_XMZero))
        {
            hue = XMVectorAdd(hue, g_XMOne);
        }
        if (XMVector3Greater(hue, g_XMOne))
        {
            hue = XMVectorSubtract(hue, g_XMOne);
        }

        if (XMVector3Less(hue, kOneSixth))
        {
            // p + (q - p) * 6 * t
            const XMVECTOR span = XMVectorSubtract(q, p);
            const XMVECTOR ramp = XMVectorMultiply(g_XMSix, hue);
            return XMVectorMultiplyAdd(span, ramp, p);
        }
        else if (XMVector3Less(hue, g_XMOneHalf))
        {
            return q;
        }
        else if (XMVector3Less(hue, kTwoThirds))
        {
            // p + (q - p) * 6 * (2/3 - t)
            const XMVECTOR span = XMVectorSubtract(q, p);
            const XMVECTOR ramp = XMVectorMultiply(g_XMSix, XMVectorSubtract(kTwoThirds, hue));
            return XMVectorMultiplyAdd(span, ramp, p);
        }

        return p;
    }

} // namespace Internal

// Converts an HSL(A) color to RGB(A).
//   hsl : x = hue, y = saturation, z = lightness, w = alpha (copied to the result).
// When saturation is within g_XMEpsilon of zero the color is achromatic and
// the lightness is returned in all three color channels.
inline XMVECTOR XM_CALLCONV XMColorHSLToRGB(FXMVECTOR hsl) noexcept
{
    static const XMVECTORF32 kOneThird = { { { 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f } } };

    const XMVECTOR sat = XMVectorSplatY(hsl);
    const XMVECTOR lum = XMVectorSplatZ(hsl);

    if (XMVector3NearEqual(sat, g_XMZero, g_XMEpsilon))
    {
        // Achromatic: (l, l, l, a)
        return XMVectorSelect(hsl, lum, g_XMSelect1110);
    }

    const XMVECTOR hue = XMVectorSplatX(hsl);

    // q = l * (1 + s) for dark colors, l + s - l * s otherwise
    const XMVECTOR q = XMVector3Less(lum, g_XMOneHalf)
        ? XMVectorMultiply(lum, XMVectorAdd(g_XMOne, sat))
        : XMVectorSubtract(XMVectorAdd(lum, sat), XMVectorMultiply(lum, sat));

    // p = 2 * l - q
    const XMVECTOR p = XMVectorSubtract(XMVectorMultiply(g_XMTwo, lum), q);

    // Each channel samples the same curve at a hue shifted by a third of a turn.
    const XMVECTOR red = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorAdd(hue, kOneThird));
    const XMVECTOR green = DirectX::Internal::XMColorHue2Clr(p, q, hue);
    const XMVECTOR blue = DirectX::Internal::XMColorHue2Clr(p, q, XMVectorSubtract(hue, kOneThird));

    // Assemble (r, g, b, a).
    const XMVECTOR redGreen = XMVectorSelect(green, red, g_XMSelect1000);
    const XMVECTOR blueAlpha = XMVectorSelect(hsl, blue, g_XMSelect1110);
    return XMVectorSelect(blueAlpha, redGreen, g_XMSelect1100);
}

//------------------------------------------------------------------------------

// Converts an RGB(A) color to HSV(A).
//   rgb : x = red, y = green, z = blue, w = alpha (copied to the result).
// Returns (hue, saturation, value, alpha). Saturation is 0 when the maximum
// channel is within g_XMEpsilon of zero; hue is 0 when max - min is below
// g_XMEpsilon (achromatic input).
inline XMVECTOR XM_CALLCONV XMColorRGBToHSV(FXMVECTOR rgb) noexcept
{
    const XMVECTOR red = XMVectorSplatX(rgb);
    const XMVECTOR green = XMVectorSplatY(rgb);
    const XMVECTOR blue = XMVectorSplatZ(rgb);

    const XMVECTOR lo = XMVectorMin(red, XMVectorMin(green, blue));
    const XMVECTOR value = XMVectorMax(red, XMVectorMax(green, blue));
    const XMVECTOR chroma = XMVectorSubtract(value, lo);

    const XMVECTOR saturation = (XMVector3NearEqual(value, g_XMZero, g_XMEpsilon)) ? g_XMZero : XMVectorDivide(chroma, value);

    // Hue stays zero for achromatic input.
    XMVECTOR hue = g_XMZero;

    if (!XMVector3Less(chroma, g_XMEpsilon))
    {
        if (XMVector3Equal(red, value))
        {
            // Red dominates; shift by a full six sectors when the raw value is negative.
            hue = XMVectorDivide(XMVectorSubtract(green, blue), chroma);

            if (XMVector3Less(green, blue))
            {
                hue = XMVectorAdd(hue, g_XMSix);
            }
        }
        else if (XMVector3Equal(green, value))
        {
            // Green dominates
            hue = XMVectorAdd(XMVectorDivide(XMVectorSubtract(blue, red), chroma), g_XMTwo);
        }
        else
        {
            // Blue dominates
            hue = XMVectorAdd(XMVectorDivide(XMVectorSubtract(red, green), chroma), g_XMFour);
        }

        hue = XMVectorDivide(hue, g_XMSix);
    }

    // Assemble (h, s, v, a).
    const XMVECTOR hueValue = XMVectorSelect(value, hue, g_XMSelect1000);
    const XMVECTOR hueValueAlpha = XMVectorSelect(rgb, hueValue, g_XMSelect1110);
    return XMVectorSelect(saturation, hueValueAlpha, g_XMSelect1011);
}

//------------------------------------------------------------------------------

// Converts an HSV(A) color to RGB(A).
//   hsv : x = hue, y = saturation, z = value, w = alpha (copied to the result).
// Hue is scaled by six to pick one of six sectors and the sector index is
// taken modulo six, so whole turns wrap (a hue of 1.0 selects the same sector
// as 0.0). Negative hues wrap the same way.
// Returns (r, g, b, hsv.w).
inline XMVECTOR XM_CALLCONV XMColorHSVToRGB(FXMVECTOR hsv) noexcept
{
    XMVECTOR h = XMVectorSplatX(hsv);
    XMVECTOR s = XMVectorSplatY(hsv);
    XMVECTOR v = XMVectorSplatZ(hsv);

    XMVECTOR h6 = XMVectorMultiply(h, g_XMSix);

    // i = sector number, f = fractional position inside the sector
    XMVECTOR i = XMVectorFloor(h6);
    XMVECTOR f = XMVectorSubtract(h6, i);

    // p = v* (1-s)
    XMVECTOR p = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, s));

    // q = v*(1-f*s)
    XMVECTOR q = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(f, s)));

    // t = v*(1 - (1-f)*s)
    XMVECTOR t = XMVectorMultiply(v, XMVectorSubtract(g_XMOne, XMVectorMultiply(XMVectorSubtract(g_XMOne, f), s)));

    auto ii = static_cast<int>(XMVectorGetX(XMVectorMod(i, g_XMSix)));

    // A negative hue can leave a negative remainder here. Without this wrap
    // every such index would fall into the default (sector 5) case below,
    // which is only right for a remainder of -1.
    if (ii < 0)
    {
        ii += 6;
    }

    XMVECTOR _rgb;

    switch (ii)
    {
    case 0: // rgb = vtp
    {
        XMVECTOR vt = XMVectorSelect(t, v, g_XMSelect1000);
        _rgb = XMVectorSelect(p, vt, g_XMSelect1100);
    }
    break;
    case 1: // rgb = qvp
    {
        XMVECTOR qv = XMVectorSelect(v, q, g_XMSelect1000);
        _rgb = XMVectorSelect(p, qv, g_XMSelect1100);
    }
    break;
    case 2: // rgb = pvt
    {
        XMVECTOR pv = XMVectorSelect(v, p, g_XMSelect1000);
        _rgb = XMVectorSelect(t, pv, g_XMSelect1100);
    }
    break;
    case 3: // rgb = pqv
    {
        XMVECTOR pq = XMVectorSelect(q, p, g_XMSelect1000);
        _rgb = XMVectorSelect(v, pq, g_XMSelect1100);
    }
    break;
    case 4: // rgb = tpv
    {
        XMVECTOR tp = XMVectorSelect(p, t, g_XMSelect1000);
        _rgb = XMVectorSelect(v, tp, g_XMSelect1100);
    }
    break;
    default: // rgb = vpq
    {
        XMVECTOR vp = XMVectorSelect(p, v, g_XMSelect1000);
        _rgb = XMVectorSelect(q, vp, g_XMSelect1100);
    }
    break;
    }

    // Color channels from the sector result, alpha from the input.
    return XMVectorSelect(hsv, _rgb, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts RGB(A) to YUV(A) with a fixed 3x3 matrix; alpha (w) is copied from
// the input. The Y weights used are 0.299 / 0.587 / 0.114.
inline XMVECTOR XM_CALLCONV XMColorRGBToYUV(FXMVECTOR rgb) noexcept
{
    // One row per input channel, listing its contribution to (Y, U, V).
    static const XMVECTORF32 FromRed = { { { 0.299f, -0.147f, 0.615f, 0.0f } } };
    static const XMVECTORF32 FromGreen = { { { 0.587f, -0.289f, -0.515f, 0.0f } } };
    static const XMVECTORF32 FromBlue = { { { 0.114f, 0.436f, -0.100f, 0.0f } } };

    const XMMATRIX ToYUV(FromRed, FromGreen, FromBlue, g_XMZero);
    const XMVECTOR yuv = XMVector3Transform(rgb, ToYUV);

    return XMVectorSelect(rgb, yuv, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts YUV(A) to RGB(A) with a fixed 3x3 matrix; alpha (w) is copied from
// the input. Y contributes with weight 1 to every output channel.
inline XMVECTOR XM_CALLCONV XMColorYUVToRGB(FXMVECTOR yuv) noexcept
{
    // Rows for the U and V inputs; the Y row is all ones (g_XMOne).
    static const XMVECTORF32 FromU = { { { 0.0f, -0.395f, 2.032f, 0.0f } } };
    static const XMVECTORF32 FromV = { { { 1.140f, -0.581f, 0.0f, 0.0f } } };

    const XMMATRIX ToRGB(g_XMOne, FromU, FromV, g_XMZero);
    const XMVECTOR rgb = XMVector3Transform(yuv, ToRGB);

    return XMVectorSelect(yuv, rgb, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts RGB(A) to YUV(A) using the "HD" coefficient set; alpha (w) is
// copied from the input. The Y weights used are 0.2126 / 0.7152 / 0.0722.
inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_HD(FXMVECTOR rgb) noexcept
{
    // One row per input channel, listing its contribution to (Y, U, V).
    static const XMVECTORF32 FromRed = { { { 0.2126f, -0.0997f, 0.6150f, 0.0f } } };
    static const XMVECTORF32 FromGreen = { { { 0.7152f, -0.3354f, -0.5586f, 0.0f } } };
    static const XMVECTORF32 FromBlue = { { { 0.0722f, 0.4351f, -0.0564f, 0.0f } } };

    const XMMATRIX ToYUV(FromRed, FromGreen, FromBlue, g_XMZero);
    const XMVECTOR yuv = XMVector3Transform(rgb, ToYUV);

    return XMVectorSelect(rgb, yuv, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts YUV(A) to RGB(A) using the "HD" coefficient set; alpha (w) is
// copied from the input. Y contributes with weight 1 to every output channel.
inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_HD(FXMVECTOR yuv) noexcept
{
    // Rows for the U and V inputs; the Y row is all ones (g_XMOne).
    static const XMVECTORF32 FromU = { { { 0.0f, -0.2153f, 2.1324f, 0.0f } } };
    static const XMVECTORF32 FromV = { { { 1.2803f, -0.3806f, 0.0f, 0.0f } } };

    const XMMATRIX ToRGB(g_XMOne, FromU, FromV, g_XMZero);
    const XMVECTOR rgb = XMVector3Transform(yuv, ToRGB);

    return XMVectorSelect(yuv, rgb, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts RGB(A) to YUV(A) using the "UHD" coefficient set; alpha (w) is
// copied from the input. The Y weights used are 0.2627 / 0.6780 / 0.0593.
inline XMVECTOR XM_CALLCONV XMColorRGBToYUV_UHD(FXMVECTOR rgb) noexcept
{
    // One row per input channel, listing its contribution to (Y, U, V).
    static const XMVECTORF32 FromRed = { { { 0.2627f, -0.1215f,  0.6150f, 0.0f } } };
    static const XMVECTORF32 FromGreen = { { { 0.6780f, -0.3136f, -0.5655f, 0.0f } } };
    static const XMVECTORF32 FromBlue = { { { 0.0593f,  0.4351f, -0.0495f, 0.0f } } };

    const XMMATRIX ToYUV(FromRed, FromGreen, FromBlue, g_XMZero);
    const XMVECTOR yuv = XMVector3Transform(rgb, ToYUV);

    return XMVectorSelect(rgb, yuv, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts YUV(A) to RGB(A) using the "UHD" coefficient set; alpha (w) is
// copied from the input. Y contributes with weight 1 to every output channel.
inline XMVECTOR XM_CALLCONV XMColorYUVToRGB_UHD(FXMVECTOR yuv) noexcept
{
    // Rows for the U and V inputs; the Y row is all ones (g_XMOne).
    static const XMVECTORF32 FromU = { { {    0.0f, -0.1891f, 2.1620f, 0.0f } } };
    static const XMVECTORF32 FromV = { { { 1.1989f, -0.4645f,    0.0f, 0.0f } } };

    const XMMATRIX ToRGB(g_XMOne, FromU, FromV, g_XMZero);
    const XMVECTOR rgb = XMVector3Transform(yuv, ToRGB);

    return XMVectorSelect(yuv, rgb, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts RGB(A) to XYZ(A): a fixed 3x3 matrix is applied first and the
// result is then scaled by 1 / 0.17697. Alpha (w) is copied from the input.
inline XMVECTOR XM_CALLCONV XMColorRGBToXYZ(FXMVECTOR rgb) noexcept
{
    // One row per input channel, listing its contribution to (X, Y, Z).
    static const XMVECTORF32 FromRed = { { { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f } } };
    static const XMVECTORF32 FromGreen = { { { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f } } };
    static const XMVECTORF32 FromBlue = { { { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f } } };
    static const XMVECTORF32 Normalize = { { { 1.f / 0.17697f, 1.f / 0.17697f, 1.f / 0.17697f, 0.0f } } };

    const XMMATRIX ToXYZ(FromRed, FromGreen, FromBlue, g_XMZero);
    const XMVECTOR mixed = XMVector3Transform(rgb, ToXYZ);
    const XMVECTOR xyz = XMVectorMultiply(mixed, Normalize);

    return XMVectorSelect(rgb, xyz, g_XMSelect1110);
}

// Converts XYZ(A) to RGB(A): the input is scaled by 0.17697 first and a fixed
// 3x3 matrix is then applied. Alpha (w) is copied from the input.
inline XMVECTOR XM_CALLCONV XMColorXYZToRGB(FXMVECTOR xyz) noexcept
{
    // One row per input component, listing its contribution to (R, G, B).
    static const XMVECTORF32 FromX = { { { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f } } };
    static const XMVECTORF32 FromY = { { { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f } } };
    static const XMVECTORF32 FromZ = { { { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f } } };
    static const XMVECTORF32 Denormalize = { { { 0.17697f, 0.17697f, 0.17697f, 0.0f } } };

    const XMMATRIX ToRGB(FromX, FromY, FromZ, g_XMZero);
    const XMVECTOR scaled = XMVectorMultiply(xyz, Denormalize);
    const XMVECTOR rgb = XMVector3Transform(scaled, ToRGB);

    return XMVectorSelect(xyz, rgb, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts XYZ(A) to sRGB(A): a fixed 3x3 matrix produces linear color, which
// is then encoded per channel with the two-segment sRGB curve. Alpha (w) is
// copied from the input.
inline XMVECTOR XM_CALLCONV XMColorXYZToSRGB(FXMVECTOR xyz) noexcept
{
    // One row per input component, listing its contribution to (R, G, B).
    static const XMVECTORF32 FromX = { { { 3.2406f, -0.9689f, 0.0557f, 0.0f } } };
    static const XMVECTORF32 FromY = { { { -1.5372f, 1.8758f, -0.2040f, 0.0f } } };
    static const XMVECTORF32 FromZ = { { { -0.4986f, 0.0415f, 1.0570f, 0.0f } } };
    static const XMVECTORF32 Threshold = { { { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f } } };
    static const XMVECTORF32 InvGamma = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.0f } } };

    const XMMATRIX ToLinearRGB(FromX, FromY, FromZ, g_XMZero);
    const XMVECTOR linearClr = XMVector3Transform(xyz, ToLinearRGB);

    // Per-channel mask: set where the curved segment applies.
    const XMVECTOR useCurve = XMVectorGreater(linearClr, Threshold);

    // Linear segment: 12.92 * c for c <= 0.0031308
    const XMVECTOR linearSeg = XMVectorMultiply(linearClr, g_XMsrgbScale);

    // Curved segment: (1 + a) * pow(c, 1/2.4) - a for c > 0.0031308 (a = 0.055)
    const XMVECTOR powered = XMVectorPow(linearClr, InvGamma);
    const XMVECTOR curvedSeg = XMVectorSubtract(XMVectorMultiply(g_XMsrgbA1, powered), g_XMsrgbA);

    const XMVECTOR encoded = XMVectorSelect(linearSeg, curvedSeg, useCurve);

    return XMVectorSelect(xyz, encoded, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Converts sRGB(A) to XYZ(A): each channel is decoded with the two-segment
// sRGB curve and the linear result is then multiplied by a fixed 3x3 matrix.
// Alpha (w) is copied from the input.
inline XMVECTOR XM_CALLCONV XMColorSRGBToXYZ(FXMVECTOR srgb) noexcept
{
    // One row per input channel, listing its contribution to (X, Y, Z).
    static const XMVECTORF32 FromRed = { { { 0.4124f, 0.2126f, 0.0193f, 0.0f } } };
    static const XMVECTORF32 FromGreen = { { { 0.3576f, 0.7152f, 0.1192f, 0.0f } } };
    static const XMVECTORF32 FromBlue = { { { 0.1805f, 0.0722f, 0.9505f, 0.0f } } };
    static const XMVECTORF32 Threshold = { { { 0.04045f, 0.04045f, 0.04045f, 0.0f } } };
    static const XMVECTORF32 Gamma = { { { 2.4f, 2.4f, 2.4f, 1.0f } } };

    // Per-channel mask: set where the curved segment applies.
    const XMVECTOR useCurve = XMVectorGreater(srgb, Threshold);

    // Linear segment: c / 12.92
    const XMVECTOR linearSeg = XMVectorDivide(srgb, g_XMsrgbScale);

    // Curved segment: pow((c + a) / (1 + a), 2.4)
    const XMVECTOR shifted = XMVectorDivide(XMVectorAdd(srgb, g_XMsrgbA), g_XMsrgbA1);
    const XMVECTOR curvedSeg = XMVectorPow(shifted, Gamma);

    const XMVECTOR linearClr = XMVectorSelect(linearSeg, curvedSeg, useCurve);

    const XMMATRIX ToXYZ(FromRed, FromGreen, FromBlue, g_XMZero);
    const XMVECTOR xyz = XMVector3Transform(linearClr, ToXYZ);

    return XMVectorSelect(srgb, xyz, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Encodes linear RGB(A) as sRGB(A).
// The input is saturated first (XMVectorSaturate), then each channel takes
// 12.92 * c below the 0.0031308 cutoff and 1.055 * pow(c, 1/2.4) - 0.055
// otherwise. Alpha (w) is copied from the unsaturated input.
inline XMVECTOR XM_CALLCONV XMColorRGBToSRGB(FXMVECTOR rgb) noexcept
{
    static const XMVECTORF32 Threshold = { { { 0.0031308f, 0.0031308f, 0.0031308f, 1.f } } };
    static const XMVECTORF32 LinearGain = { { { 12.92f, 12.92f, 12.92f, 1.f } } };
    static const XMVECTORF32 CurveGain = { { { 1.055f, 1.055f, 1.055f, 1.f } } };
    static const XMVECTORF32 CurveOffset = { { { 0.055f, 0.055f, 0.055f, 0.f } } };
    static const XMVECTORF32 CurveExp = { { { 1.0f / 2.4f, 1.0f / 2.4f, 1.0f / 2.4f, 1.f } } };

    const XMVECTOR clamped = XMVectorSaturate(rgb);

    const XMVECTOR linearSeg = XMVectorMultiply(clamped, LinearGain);
    const XMVECTOR powered = XMVectorPow(clamped, CurveExp);
    const XMVECTOR curvedSeg = XMVectorSubtract(XMVectorMultiply(CurveGain, powered), CurveOffset);

    // Mask is set where the linear segment applies.
    const XMVECTOR useLinear = XMVectorLess(clamped, Threshold);
    const XMVECTOR encoded = XMVectorSelect(curvedSeg, linearSeg, useLinear);

    return XMVectorSelect(rgb, encoded, g_XMSelect1110);
}

//------------------------------------------------------------------------------

// Decodes sRGB(A) to linear RGB(A).
// The input is saturated first (XMVectorSaturate), then each channel takes
// pow((c + 0.055) / 1.055, 2.4) above the 0.04045 cutoff and c / 12.92
// otherwise. Alpha (w) is copied from the unsaturated input.
inline XMVECTOR XM_CALLCONV XMColorSRGBToRGB(FXMVECTOR srgb) noexcept
{
    static const XMVECTORF32 Threshold = { { { 0.04045f, 0.04045f, 0.04045f, 1.f } } };
    static const XMVECTORF32 LinearGain = { { { 1.f / 12.92f, 1.f / 12.92f, 1.f / 12.92f, 1.f } } };
    static const XMVECTORF32 CurveGain = { { { 1.f / 1.055f, 1.f / 1.055f, 1.f / 1.055f, 1.f } } };
    static const XMVECTORF32 CurveOffset = { { { 0.055f, 0.055f, 0.055f, 0.f } } };
    static const XMVECTORF32 CurveExp = { { { 2.4f, 2.4f, 2.4f, 1.f } } };

    const XMVECTOR clamped = XMVectorSaturate(srgb);

    const XMVECTOR linearSeg = XMVectorMultiply(clamped, LinearGain);
    const XMVECTOR shifted = XMVectorMultiply(XMVectorAdd(clamped, CurveOffset), CurveGain);
    const XMVECTOR curvedSeg = XMVectorPow(shifted, CurveExp);

    // Mask is set where the curved segment applies.
    const XMVECTOR useCurve = XMVectorGreater(clamped, Threshold);
    const XMVECTOR decoded = XMVectorSelect(linearSeg, curvedSeg, useCurve);

    return XMVectorSelect(srgb, decoded, g_XMSelect1110);
}

/****************************************************************************
 *
 * Miscellaneous
 *
 ****************************************************************************/

 //------------------------------------------------------------------------------

// Reports whether the running CPU provides every instruction-set extension
// this build of the library was compiled to use.
// On the SSE path the answer comes from CPUID; on the ARM-NEON and
// no-intrinsics paths the function always returns true.
// CPUInfo[0..3] hold EAX, EBX, ECX, EDX of the most recent CPUID query.
inline bool XMVerifyCPUSupport() noexcept
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    int CPUInfo[4] = { -1 };
    // Leaf 0: EAX reports the highest basic CPUID leaf available.
#if defined(__clang__) || defined(__GNUC__)
    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
    __cpuid(CPUInfo, 0);
#endif

    // Leaf 7 is queried further down whenever AVX2 is in use, whether that was
    // enabled by the compiler (__AVX2__) or by the library switch
    // (_XM_AVX2_INTRINSICS_), so the availability guard uses the same condition.
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
    if (CPUInfo[0] < 7)
        return false;
#else
    if (CPUInfo[0] < 1)
        return false;
#endif

    // Leaf 1: feature flags in ECX (CPUInfo[2]) and EDX (CPUInfo[3]).
#if defined(__clang__) || defined(__GNUC__)
    __cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
    __cpuid(CPUInfo, 1);
#endif

#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
    // The compiler can emit FMA3 instructions even without explicit intrinsics use
    if ((CPUInfo[2] & 0x38081001) != 0x38081001)
        return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
    if ((CPUInfo[2] & 0x38081001) != 0x38081001)
        return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_)
    if ((CPUInfo[2] & 0x18081001) != 0x18081001)
        return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_F16C_INTRINSICS_)
    if ((CPUInfo[2] & 0x38080001) != 0x38080001)
        return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
#elif defined(__AVX__) || defined(_XM_AVX_INTRINSICS_)
    if ((CPUInfo[2] & 0x18080001) != 0x18080001)
        return false; // No AVX/OSXSAVE/SSE4.1/SSE3 support
#elif defined(_XM_SSE4_INTRINSICS_)
    if ((CPUInfo[2] & 0x80001) != 0x80001)
        return false; // No SSE3/SSE4.1 support
#elif defined(_XM_SSE3_INTRINSICS_)
    if (!(CPUInfo[2] & 0x1))
        return false; // No SSE3 support
#endif

    // The x64 processor model requires SSE2 support, but no harm in checking
    if ((CPUInfo[3] & 0x6000000) != 0x6000000)
        return false; // No SSE2/SSE support

#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
    // Leaf 7, sub-leaf 0: EBX bit 5 (0x20) is tested for AVX2.
#if defined(__clang__) || defined(__GNUC__)
    __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
    __cpuidex(CPUInfo, 7, 0);
#endif
    if (!(CPUInfo[1] & 0x20))
        return false; // No AVX2 support
#endif

    return true;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // ARM-NEON support is required for the Windows on ARM platform
    return true;
#else
    // No intrinsics path always supported
    return true;
#endif
}

//------------------------------------------------------------------------------

// Evaluates the Fresnel term given by the formula below, component-wise.
//   CosIncidentAngle : cosine of the incident angle (c); asserted not to be
//                      infinite in debug builds.
//   RefractionIndex  : refraction index used in g.
// Returns the term clamped to [0, 1] in every component.
inline XMVECTOR XM_CALLCONV XMFresnelTerm
(
    FXMVECTOR CosIncidentAngle,
    FXMVECTOR RefractionIndex
) noexcept
{
    assert(!XMVector4IsInfinite(CosIncidentAngle));

    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
    // c = CosIncidentAngle
    // g = sqrt(c^2 + RefractionIndex^2 - 1)

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // G = sqrt(abs(RefractionIndex^2 - 1 + c^2)); the abs keeps the sqrt argument non-negative
    XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
    G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
    G = XMVectorAbs(G);
    G = XMVectorSqrt(G);

    // S = g + c, D = g - c
    XMVECTOR S = XMVectorAdd(G, CosIncidentAngle);
    XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle);

    // V0 = 0.5 * D^2 / S^2 (the division is done as a multiply by the reciprocal)
    XMVECTOR V0 = XMVectorMultiply(D, D);
    XMVECTOR V1 = XMVectorMultiply(S, S);
    V1 = XMVectorReciprocal(V1);
    V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
    V0 = XMVectorMultiply(V0, V1);

    // V2 = (c * S - 1)^2 / (c * D + 1)^2 + 1
    XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
    XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
    V2 = XMVectorMultiply(V2, V2);
    V3 = XMVectorMultiply(V3, V3);
    V3 = XMVectorReciprocal(V3);
    V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);

    XMVECTOR Result = XMVectorMultiply(V0, V2);

    // Clamp to 0.0 - 1.0f
    Result = XMVectorSaturate(Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
    XMVECTOR G = _mm_mul_ps(RefractionIndex, RefractionIndex);
    XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle, CosIncidentAngle);
    G = _mm_sub_ps(G, g_XMOne);
    vTemp = _mm_add_ps(vTemp, G);
    // max((0-vTemp),vTemp) == abs(vTemp)
    // The abs is needed to deal with refraction and cosine being zero
    G = _mm_setzero_ps();
    G = _mm_sub_ps(G, vTemp);
    G = _mm_max_ps(G, vTemp);
    // Last operation, the sqrt()
    G = _mm_sqrt_ps(G);

    // Calc G-C and G+C
    XMVECTOR GAddC = _mm_add_ps(G, CosIncidentAngle);
    XMVECTOR GSubC = _mm_sub_ps(G, CosIncidentAngle);
    // Perform the term (0.5f *(g - c)^2) / (g + c)^2
    XMVECTOR vResult = _mm_mul_ps(GSubC, GSubC);
    vTemp = _mm_mul_ps(GAddC, GAddC);
    vResult = _mm_mul_ps(vResult, g_XMOneHalf);
    vResult = _mm_div_ps(vResult, vTemp);
    // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
    // (GAddC and GSubC are overwritten in place from here on)
    GAddC = _mm_mul_ps(GAddC, CosIncidentAngle);
    GSubC = _mm_mul_ps(GSubC, CosIncidentAngle);
    GAddC = _mm_sub_ps(GAddC, g_XMOne);
    GSubC = _mm_add_ps(GSubC, g_XMOne);
    GAddC = _mm_mul_ps(GAddC, GAddC);
    GSubC = _mm_mul_ps(GSubC, GSubC);
    GAddC = _mm_div_ps(GAddC, GSubC);
    GAddC = _mm_add_ps(GAddC, g_XMOne);
    // Multiply the two term parts
    vResult = _mm_mul_ps(vResult, GAddC);
    // Clamp to 0.0 - 1.0f
    vResult = _mm_max_ps(vResult, g_XMZero);
    vResult = _mm_min_ps(vResult, g_XMOne);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

// Returns true when S1 and S2 differ by no more than Epsilon, i.e.
// |S1 - S2| <= Epsilon. Any NaN operand makes the comparison false.
inline bool XMScalarNearEqual
(
    float S1,
    float S2,
    float Epsilon
) noexcept
{
    const float fDistance = fabsf(S1 - S2);
    return fDistance <= Epsilon;
}

//------------------------------------------------------------------------------
// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
inline float XMScalarModAngle(float Angle) noexcept
{
    // The remainder is taken on the magnitude only; this sidesteps a precision
    // problem for values close to PI. The sign is put back at the end.

    // Shift so that the target interval starts at zero.
    const float fShifted = Angle + XM_PI;

    // Remove whole turns from the magnitude (truncating division).
    float fWrapped = fabsf(fShifted);
    const int32_t nTurns = static_cast<int32_t>(fWrapped / XM_2PI);
    fWrapped = fWrapped - (XM_2PI * static_cast<float>(nTurns));

    // Undo the shift.
    fWrapped = fWrapped - XM_PI;

    // A negative shifted input gets its sign restored.
    return (fShifted < 0.0f) ? -fWrapped : fWrapped;
}

//------------------------------------------------------------------------------

// Approximates sin(Value) for an angle in radians: the argument is reduced to
// [-pi/2, pi/2] and fed to an 11-degree minimax polynomial.
inline float XMScalarSin(float Value) noexcept
{
    // Write Value as 2*pi*turns + y. turns is Value / (2*pi) rounded to the
    // nearest integer by adding +/-0.5 and truncating.
    const float fHalf = (Value >= 0.0f) ? 0.5f : -0.5f;
    float fTurns = XM_1DIV2PI * Value;
    fTurns = static_cast<float>(static_cast<int>(fTurns + fHalf));
    float y = Value - XM_2PI * fTurns;

    // Reflect y into [-pi/2, pi/2]; sin is unchanged by these reflections.
    y = (y > XM_PIDIV2) ? (XM_PI - y)
        : ((y < -XM_PIDIV2) ? (-XM_PI - y) : y);

    // 11-degree minimax approximation (odd polynomial, evaluated in Horner form)
    const float ySq = y * y;
    return (((((-2.3889859e-08f * ySq + 2.7525562e-06f) * ySq - 0.00019840874f) * ySq + 0.0083333310f) * ySq - 0.16666667f) * ySq + 1.0f) * y;
}

//------------------------------------------------------------------------------

// Estimates sin(Value) for an angle in radians: same argument reduction as the
// full-precision version, followed by a shorter 7-degree minimax polynomial.
inline float XMScalarSinEst(float Value) noexcept
{
    // Write Value as 2*pi*turns + y. turns is Value / (2*pi) rounded to the
    // nearest integer by adding +/-0.5 and truncating.
    const float fHalf = (Value >= 0.0f) ? 0.5f : -0.5f;
    float fTurns = XM_1DIV2PI * Value;
    fTurns = static_cast<float>(static_cast<int>(fTurns + fHalf));
    float y = Value - XM_2PI * fTurns;

    // Reflect y into [-pi/2, pi/2]; sin is unchanged by these reflections.
    y = (y > XM_PIDIV2) ? (XM_PI - y)
        : ((y < -XM_PIDIV2) ? (-XM_PI - y) : y);

    // 7-degree minimax approximation (odd polynomial, evaluated in Horner form)
    const float ySq = y * y;
    return (((-0.00018524670f * ySq + 0.0083139502f) * ySq - 0.16665852f) * ySq + 1.0f) * y;
}

//------------------------------------------------------------------------------

// Computes an approximation of cos(Value), with Value given in radians.
// Value is reduced by whole turns, folded onto the quarter turn on either side
// of zero (tracking the sign change that the fold introduces), and then fed to
// an even polynomial.
// Note: the turn count passes through an int conversion, so Value / (2*pi) is
// expected to be representable as an int.
inline float XMScalarCos(float Value) noexcept
{
    // Nearest whole number of turns: bias by one half in the direction of
    // Value's sign and let the int conversion truncate toward zero.
    const float scaled = XM_1DIV2PI * Value;
    const int wholeTurns = (Value >= 0.0f)
        ? static_cast<int>(scaled + 0.5f)
        : static_cast<int>(scaled - 0.5f);
    const float turns = static_cast<float>(wholeTurns);

    // Remainder once the whole turns are removed; y lies in [-pi,pi].
    float y = Value - XM_2PI * turns;

    // Reflect the outer quarters so that y lands in [-pi/2,pi/2]. The
    // reflection negates the cosine, so the result must be multiplied by -1
    // whenever a reflection was applied.
    float sign = +1.0f;
    if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }

    // 10-degree minimax approximation, evaluated in Horner form on y*y.
    const float ySq = y * y;
    const float poly = ((((-2.6051615e-07f * ySq
        + 2.4760495e-05f) * ySq
        - 0.0013888378f) * ySq
        + 0.041666638f) * ySq
        - 0.5f) * ySq
        + 1.0f;
    return sign * poly;
}

//------------------------------------------------------------------------------

// Computes a lower-order estimate of cos(Value), with Value given in radians.
// Uses the same range reduction as the full-precision routine but a shorter
// (6-degree) polynomial.
// Note: the turn count passes through an int conversion, so Value / (2*pi) is
// expected to be representable as an int.
inline float XMScalarCosEst(float Value) noexcept
{
    // Nearest whole number of turns: bias by one half in the direction of
    // Value's sign and let the int conversion truncate toward zero.
    const float scaled = XM_1DIV2PI * Value;
    const int wholeTurns = (Value >= 0.0f)
        ? static_cast<int>(scaled + 0.5f)
        : static_cast<int>(scaled - 0.5f);
    const float turns = static_cast<float>(wholeTurns);

    // Remainder once the whole turns are removed; y lies in [-pi,pi].
    float y = Value - XM_2PI * turns;

    // Reflect the outer quarters so that y lands in [-pi/2,pi/2]. The
    // reflection negates the cosine, so the result must be multiplied by -1
    // whenever a reflection was applied.
    float sign = +1.0f;
    if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }

    // 6-degree minimax approximation, evaluated in Horner form on y*y.
    const float ySq = y * y;
    const float poly = ((-0.0012712436f * ySq
        + 0.041493919f) * ySq
        - 0.49992746f) * ySq
        + 1.0f;
    return sign * poly;
}

//------------------------------------------------------------------------------


// Computes approximations of sin(Value) and cos(Value) in one pass, sharing a
// single range reduction between the two polynomials.
//   pSin  - receives the sine approximation; must not be null (asserted).
//   pCos  - receives the cosine approximation; must not be null (asserted).
//   Value - angle in radians.
// The sine is stored before the cosine.
// Note: the turn count passes through an int conversion, so Value / (2*pi) is
// expected to be representable as an int.
inline void XMScalarSinCos
(
    float* pSin,
    float* pCos,
    float  Value
) noexcept
{
    assert(pSin);
    assert(pCos);

    // Nearest whole number of turns: bias by one half in the direction of
    // Value's sign and let the int conversion truncate toward zero.
    const float scaled = XM_1DIV2PI * Value;
    const int wholeTurns = (Value >= 0.0f)
        ? static_cast<int>(scaled + 0.5f)
        : static_cast<int>(scaled - 0.5f);
    const float turns = static_cast<float>(wholeTurns);

    // Remainder once the whole turns are removed; y lies in [-pi,pi].
    float y = Value - XM_2PI * turns;

    // Reflect the outer quarters so that y lands in [-pi/2,pi/2]. The sine is
    // unchanged by the reflection; the cosine is negated, which is recorded
    // in sign.
    float sign = +1.0f;
    if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }

    const float ySq = y * y;

    // Sine: 11-degree minimax approximation (odd polynomial in y).
    const float sinApprox = (((((-2.3889859e-08f * ySq
        + 2.7525562e-06f) * ySq
        - 0.00019840874f) * ySq
        + 0.0083333310f) * ySq
        - 0.16666667f) * ySq
        + 1.0f) * y;
    *pSin = sinApprox;

    // Cosine: 10-degree minimax approximation (even polynomial in y), with
    // the reflection sign applied afterwards.
    const float cosPoly = ((((-2.6051615e-07f * ySq
        + 2.4760495e-05f) * ySq
        - 0.0013888378f) * ySq
        + 0.041666638f) * ySq
        - 0.5f) * ySq
        + 1.0f;
    *pCos = sign * cosPoly;
}

//------------------------------------------------------------------------------


// Computes lower-order estimates of sin(Value) and cos(Value) in one pass,
// sharing a single range reduction between the two polynomials.
//   pSin  - receives the sine estimate; must not be null (asserted).
//   pCos  - receives the cosine estimate; must not be null (asserted).
//   Value - angle in radians.
// The sine is stored before the cosine.
// Note: the turn count passes through an int conversion, so Value / (2*pi) is
// expected to be representable as an int.
inline void XMScalarSinCosEst
(
    float* pSin,
    float* pCos,
    float  Value
) noexcept
{
    assert(pSin);
    assert(pCos);

    // Nearest whole number of turns: bias by one half in the direction of
    // Value's sign and let the int conversion truncate toward zero.
    const float scaled = XM_1DIV2PI * Value;
    const int wholeTurns = (Value >= 0.0f)
        ? static_cast<int>(scaled + 0.5f)
        : static_cast<int>(scaled - 0.5f);
    const float turns = static_cast<float>(wholeTurns);

    // Remainder once the whole turns are removed; y lies in [-pi,pi].
    float y = Value - XM_2PI * turns;

    // Reflect the outer quarters so that y lands in [-pi/2,pi/2]. The sine is
    // unchanged by the reflection; the cosine is negated, which is recorded
    // in sign.
    float sign = +1.0f;
    if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }

    const float ySq = y * y;

    // Sine: 7-degree minimax approximation (odd polynomial in y).
    const float sinApprox = (((-0.00018524670f * ySq
        + 0.0083139502f) * ySq
        - 0.16665852f) * ySq
        + 1.0f) * y;
    *pSin = sinApprox;

    // Cosine: 6-degree minimax approximation (even polynomial in y), with
    // the reflection sign applied afterwards.
    const float cosPoly = ((-0.0012712436f * ySq
        + 0.041493919f) * ySq
        - 0.49992746f) * ySq
        + 1.0f;
    *pCos = sign * cosPoly;
}

//------------------------------------------------------------------------------

// Computes an approximation of asin(Value), in radians.
// The routine evaluates acos(|Value|) as sqrt(1 - |Value|) times a polynomial
// in |Value|, then converts to the arcsine and restores the sign.
inline float XMScalarASin(float Value) noexcept
{
    const float magnitude = fabsf(Value);

    // 1 - |Value|, clamped at zero so the square root never sees a negative
    // argument when |Value| exceeds 1.
    float oneMinus = 1.0f - magnitude;
    oneMinus = (oneMinus < 0.0f) ? 0.0f : oneMinus;
    const float root = sqrtf(oneMinus);

    // 7-degree minimax approximation, evaluated in Horner form.
    const float poly = ((((((-0.0012624911f * magnitude
        + 0.0066700901f) * magnitude
        - 0.0170881256f) * magnitude
        + 0.0308918810f) * magnitude
        - 0.0501743046f) * magnitude
        + 0.0889789874f) * magnitude
        - 0.2145988016f) * magnitude
        + 1.5707963050f;
    const float acosAbs = poly * root;  // acos(|Value|)

    // asin(x) = pi/2 - acos(x); for negative input the arcsine is the
    // negation of the arcsine of the magnitude.
    if (Value >= 0.0f)
    {
        return XM_PIDIV2 - acosAbs;
    }
    return acosAbs - XM_PIDIV2;
}

//------------------------------------------------------------------------------

// Computes a lower-order estimate of asin(Value), in radians.
// The routine evaluates acos(|Value|) as sqrt(1 - |Value|) times a short
// polynomial in |Value|, then converts to the arcsine and restores the sign.
inline float XMScalarASinEst(float Value) noexcept
{
    const float magnitude = fabsf(Value);

    // 1 - |Value|, clamped at zero so the square root never sees a negative
    // argument when |Value| exceeds 1.
    float oneMinus = 1.0f - magnitude;
    oneMinus = (oneMinus < 0.0f) ? 0.0f : oneMinus;
    const float root = sqrtf(oneMinus);

    // 3-degree minimax approximation, evaluated in Horner form.
    const float poly = ((-0.0187293f * magnitude
        + 0.0742610f) * magnitude
        - 0.2121144f) * magnitude
        + 1.5707288f;
    const float acosAbs = poly * root;  // acos(|Value|)

    // asin(x) = pi/2 - acos(x); for negative input the arcsine is the
    // negation of the arcsine of the magnitude.
    if (Value >= 0.0f)
    {
        return XM_PIDIV2 - acosAbs;
    }
    return acosAbs - XM_PIDIV2;
}

//------------------------------------------------------------------------------

// Computes an approximation of acos(Value), in radians.
// The routine evaluates acos(|Value|) as sqrt(1 - |Value|) times a polynomial
// in |Value|, then mirrors the result for negative input.
inline float XMScalarACos(float Value) noexcept
{
    const float magnitude = fabsf(Value);

    // 1 - |Value|, clamped at zero so the square root never sees a negative
    // argument when |Value| exceeds 1.
    float oneMinus = 1.0f - magnitude;
    oneMinus = (oneMinus < 0.0f) ? 0.0f : oneMinus;
    const float root = sqrtf(oneMinus);

    // 7-degree minimax approximation, evaluated in Horner form.
    const float poly = ((((((-0.0012624911f * magnitude
        + 0.0066700901f) * magnitude
        - 0.0170881256f) * magnitude
        + 0.0308918810f) * magnitude
        - 0.0501743046f) * magnitude
        + 0.0889789874f) * magnitude
        - 0.2145988016f) * magnitude
        + 1.5707963050f;
    const float acosAbs = poly * root;  // acos(|Value|)

    // acos(x) = pi - acos(-x) when x < 0.
    if (Value >= 0.0f)
    {
        return acosAbs;
    }
    return XM_PI - acosAbs;
}

//------------------------------------------------------------------------------

// Computes a lower-order estimate of acos(Value), in radians.
// The routine evaluates acos(|Value|) as sqrt(1 - |Value|) times a short
// polynomial in |Value|, then mirrors the result for negative input.
inline float XMScalarACosEst(float Value) noexcept
{
    const float magnitude = fabsf(Value);

    // 1 - |Value|, clamped at zero so the square root never sees a negative
    // argument when |Value| exceeds 1.
    float oneMinus = 1.0f - magnitude;
    oneMinus = (oneMinus < 0.0f) ? 0.0f : oneMinus;
    const float root = sqrtf(oneMinus);

    // 3-degree minimax approximation, evaluated in Horner form.
    const float poly = ((-0.0187293f * magnitude
        + 0.0742610f) * magnitude
        - 0.2121144f) * magnitude
        + 1.5707288f;
    const float acosAbs = poly * root;  // acos(|Value|)

    // acos(x) = pi - acos(-x) when x < 0.
    if (Value >= 0.0f)
    {
        return acosAbs;
    }
    return XM_PI - acosAbs;
}


#ifdef __clang__
#pragma clang diagnostic pop
#endif
#ifdef _PREFAST_
#pragma prefast(pop)
#endif
#ifdef _MSC_VER
#pragma warning(pop)
#endif

} // namespace DirectX
